diff --git a/src/intel/compiler/elk/brw_asm.h b/src/intel/compiler/elk/brw_asm.h new file mode 100644 index 00000000000..d6d9ce47b03 --- /dev/null +++ b/src/intel/compiler/elk/brw_asm.h @@ -0,0 +1,122 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef BRW_ASM_H +#define BRW_ASM_H + +#include +#include +#include + +#include "compiler/brw_reg.h" +#include "compiler/brw_reg_type.h" +#include "compiler/brw_eu_defines.h" +#include "compiler/brw_inst.h" +#include "compiler/brw_eu.h" +#include "dev/intel_device_info.h" +#include "util/list.h" + +/* glibc < 2.27 defines OVERFLOW in /usr/include/math.h. 
*/ +#undef OVERFLOW + +int yyparse(void); +int yylex(void); +char *lex_text(void); + +extern struct brw_codegen *p; +extern int errors; +extern char *input_filename; + +extern struct list_head instr_labels; +extern struct list_head target_labels; + +struct condition { + unsigned cond_modifier:4; + unsigned flag_reg_nr:1; + unsigned flag_subreg_nr:1; +}; + +struct predicate { + unsigned pred_control:4; + unsigned pred_inv:1; + unsigned flag_reg_nr:1; + unsigned flag_subreg_nr:1; +}; + +enum instoption_type { + INSTOPTION_FLAG, + INSTOPTION_DEP_INFO, +}; + +struct instoption { + enum instoption_type type; + union { + unsigned uint_value; + struct tgl_swsb depinfo_value; + }; +}; + +struct options { + unsigned access_mode:1; + unsigned compression_control:2; + unsigned thread_control:2; + unsigned no_dd_check:1; // Dependency control + unsigned no_dd_clear:1; // Dependency control + unsigned mask_control:1; + unsigned debug_control:1; + unsigned acc_wr_control:1; + unsigned end_of_thread:1; + unsigned compaction:1; + unsigned qtr_ctrl:2; + unsigned nib_ctrl:1; + unsigned is_compr:1; + struct tgl_swsb depinfo; +}; + +struct msgdesc { + unsigned ex_bso:1; + unsigned src1_len:5; +}; + +enum instr_label_type { + INSTR_LABEL_JIP, + INSTR_LABEL_UIP, +}; + +struct instr_label { + struct list_head link; + + char *name; + int offset; + enum instr_label_type type; +}; + +struct target_label { + struct list_head link; + + char *name; + int offset; +}; + +#endif /* BRW_ASM_H */ diff --git a/src/intel/compiler/elk/brw_asm_tool.c b/src/intel/compiler/elk/brw_asm_tool.c new file mode 100644 index 00000000000..6e4a5fce52d --- /dev/null +++ b/src/intel/compiler/elk/brw_asm_tool.c @@ -0,0 +1,385 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the 
rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include "brw_asm.h" +#include "intel/compiler/brw_disasm_info.h" + +enum opt_output_type { + OPT_OUTPUT_HEX, + OPT_OUTPUT_C_LITERAL, + OPT_OUTPUT_BIN, +}; + +extern FILE *yyin; +struct brw_codegen *p; +static enum opt_output_type output_type = OPT_OUTPUT_BIN; +char *input_filename = NULL; +int errors; + +struct list_head instr_labels; +struct list_head target_labels; + +static void +print_help(const char *progname, FILE *file) +{ + fprintf(file, + "Usage: %s [OPTION] inputfile\n" + "Assemble i965 instructions from input file.\n\n" + " -h, --help display this help and exit\n" + " -t, --type=OUTPUT_TYPE OUTPUT_TYPE can be 'bin' (default if omitted),\n" + " 'c_literal', or 'hex'\n" + " -o, --output specify output file\n" + " --compact print compacted instructions\n" + " -g, --gen=platform assemble instructions for given \n" + " platform (3 letter platform name)\n" + "Example:\n" + " i965_asm -g kbl input.asm -t hex -o output\n", + progname); +} + +static uint32_t +get_dword(const brw_inst *inst, int idx) +{ + uint32_t dword; + memcpy(&dword, (char *)inst + 4 * idx, 
sizeof(dword)); + return dword; +} + +static void +print_instruction(FILE *output, bool compact, const brw_inst *instruction) +{ + int byte_limit; + + byte_limit = (compact == true) ? 8 : 16; + + switch (output_type) { + case OPT_OUTPUT_HEX: { + fprintf(output, "%02x", ((unsigned char *)instruction)[0]); + + for (unsigned i = 1; i < byte_limit; i++) { + fprintf(output, " %02x", ((unsigned char *)instruction)[i]); + } + break; + } + case OPT_OUTPUT_C_LITERAL: { + fprintf(output, "\t0x%08x,", get_dword(instruction, 0)); + + for (unsigned i = 1; i < byte_limit / 4; i++) + fprintf(output, " 0x%08x,", get_dword(instruction, i)); + + break; + } + case OPT_OUTPUT_BIN: + fwrite(instruction, 1, byte_limit, output); + break; + } + + if (output_type != OPT_OUTPUT_BIN) { + fprintf(output, "\n"); + } +} + +static struct intel_device_info * +i965_disasm_init(uint16_t pci_id) +{ + struct intel_device_info *devinfo; + + devinfo = malloc(sizeof *devinfo); + if (devinfo == NULL) + return NULL; + + if (!intel_get_device_info_from_pci_id(pci_id, devinfo)) { + fprintf(stderr, "can't find device information: pci_id=0x%x\n", + pci_id); + free(devinfo); + return NULL; + } + + return devinfo; +} + +static bool +i965_postprocess_labels() +{ + if (p->devinfo->ver < 6) { + return true; + } + + void *store = p->store; + + struct target_label *tlabel; + struct instr_label *ilabel, *s; + + const unsigned to_bytes_scale = brw_jump_scale(p->devinfo); + + LIST_FOR_EACH_ENTRY(tlabel, &target_labels, link) { + LIST_FOR_EACH_ENTRY_SAFE(ilabel, s, &instr_labels, link) { + if (!strcmp(tlabel->name, ilabel->name)) { + brw_inst *inst = store + ilabel->offset; + + int relative_offset = (tlabel->offset - ilabel->offset) / sizeof(brw_inst); + relative_offset *= to_bytes_scale; + + unsigned opcode = brw_inst_opcode(p->isa, inst); + + if (ilabel->type == INSTR_LABEL_JIP) { + switch (opcode) { + case BRW_OPCODE_IF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + if (p->devinfo->ver 
>= 7) { + brw_inst_set_jip(p->devinfo, inst, relative_offset); + } else if (p->devinfo->ver == 6) { + brw_inst_set_gfx6_jump_count(p->devinfo, inst, relative_offset); + } + break; + case BRW_OPCODE_BREAK: + case BRW_OPCODE_HALT: + case BRW_OPCODE_CONTINUE: + brw_inst_set_jip(p->devinfo, inst, relative_offset); + break; + default: + fprintf(stderr, "Unknown opcode %d with JIP label\n", opcode); + return false; + } + } else { + switch (opcode) { + case BRW_OPCODE_IF: + case BRW_OPCODE_ELSE: + if (p->devinfo->ver > 7) { + brw_inst_set_uip(p->devinfo, inst, relative_offset); + } else if (p->devinfo->ver == 7) { + brw_inst_set_uip(p->devinfo, inst, relative_offset); + } else if (p->devinfo->ver == 6) { + // Nothing + } + break; + case BRW_OPCODE_WHILE: + case BRW_OPCODE_ENDIF: + fprintf(stderr, "WHILE/ENDIF cannot have UIP offset\n"); + return false; + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + case BRW_OPCODE_HALT: + brw_inst_set_uip(p->devinfo, inst, relative_offset); + break; + default: + fprintf(stderr, "Unknown opcode %d with UIP label\n", opcode); + return false; + } + } + + list_del(&ilabel->link); + } + } + } + + LIST_FOR_EACH_ENTRY(ilabel, &instr_labels, link) { + fprintf(stderr, "Unknown label '%s'\n", ilabel->name); + } + + return list_is_empty(&instr_labels); +} + +int main(int argc, char **argv) +{ + char *output_file = NULL; + char c; + FILE *output = stdout; + bool help = false, compact = false; + void *store; + uint64_t pci_id = 0; + int offset = 0, err; + int start_offset = 0; + struct disasm_info *disasm_info; + struct intel_device_info *devinfo = NULL; + int result = EXIT_FAILURE; + list_inithead(&instr_labels); + list_inithead(&target_labels); + + const struct option i965_asm_opts[] = { + { "help", no_argument, (int *) &help, true }, + { "type", required_argument, NULL, 't' }, + { "gen", required_argument, NULL, 'g' }, + { "output", required_argument, NULL, 'o' }, + { "compact", no_argument, (int *) &compact, true }, + { NULL, 0, NULL, 0 } 
+ }; + + while ((c = getopt_long(argc, argv, ":t:g:o:h", i965_asm_opts, NULL)) != -1) { + switch (c) { + case 'g': { + const int id = intel_device_name_to_pci_device_id(optarg); + if (id < 0) { + fprintf(stderr, "can't parse gen: '%s', expected 3 letter " + "platform name\n", optarg); + goto end; + } else { + pci_id = id; + } + break; + } + case 'h': + help = true; + print_help(argv[0], stderr); + goto end; + case 't': { + if (strcmp(optarg, "hex") == 0) { + output_type = OPT_OUTPUT_HEX; + } else if (strcmp(optarg, "c_literal") == 0) { + output_type = OPT_OUTPUT_C_LITERAL; + } else if (strcmp(optarg, "bin") == 0) { + output_type = OPT_OUTPUT_BIN; + } else { + fprintf(stderr, "invalid value for --type: %s\n", optarg); + goto end; + } + break; + } + case 'o': + output_file = strdup(optarg); + break; + case 0: + break; + case ':': + fprintf(stderr, "%s: option `-%c' requires an argument\n", + argv[0], optopt); + goto end; + case '?': + default: + fprintf(stderr, "%s: option `-%c' is invalid: ignored\n", + argv[0], optopt); + goto end; + } + } + + if (help || !pci_id) { + print_help(argv[0], stderr); + goto end; + } + + if (!argv[optind]) { + fprintf(stderr, "Please specify input file\n"); + goto end; + } + + input_filename = strdup(argv[optind]); + yyin = fopen(input_filename, "r"); + if (!yyin) { + fprintf(stderr, "Unable to read input file : %s\n", + input_filename); + goto end; + } + + if (output_file) { + output = fopen(output_file, "w"); + if (!output) { + fprintf(stderr, "Couldn't open output file\n"); + goto end; + } + } + + devinfo = i965_disasm_init(pci_id); + if (!devinfo) { + fprintf(stderr, "Unable to allocate memory for " + "intel_device_info struct instance.\n"); + goto end; + } + + struct brw_isa_info isa; + brw_init_isa_info(&isa, devinfo); + + p = rzalloc(NULL, struct brw_codegen); + brw_init_codegen(&isa, p, p); + p->automatic_exec_sizes = false; + + err = yyparse(); + if (err || errors) + goto end; + + if (!i965_postprocess_labels()) + goto end; + + 
store = p->store; + + disasm_info = disasm_initialize(p->isa, NULL); + if (!disasm_info) { + fprintf(stderr, "Unable to initialize disasm_info struct instance\n"); + goto end; + } + + if (output_type == OPT_OUTPUT_C_LITERAL) + fprintf(output, "{\n"); + + brw_validate_instructions(p->isa, p->store, 0, + p->next_insn_offset, disasm_info); + + const int nr_insn = (p->next_insn_offset - start_offset) / 16; + + if (compact) + brw_compact_instructions(p, start_offset, disasm_info); + + for (int i = 0; i < nr_insn; i++) { + const brw_inst *insn = store + offset; + bool compacted = false; + + if (compact && brw_inst_cmpt_control(p->devinfo, insn)) { + offset += 8; + compacted = true; + } else { + offset += 16; + } + + print_instruction(output, compacted, insn); + } + + ralloc_free(disasm_info); + + if (output_type == OPT_OUTPUT_C_LITERAL) + fprintf(output, "}"); + + result = EXIT_SUCCESS; + goto end; + +end: + free(input_filename); + free(output_file); + + if (yyin) + fclose(yyin); + + if (output) + fclose(output); + + if (p) + ralloc_free(p); + + if (devinfo) + free(devinfo); + + exit(result); +} diff --git a/src/intel/compiler/elk/brw_cfg.cpp b/src/intel/compiler/elk/brw_cfg.cpp new file mode 100644 index 00000000000..01cb42635c1 --- /dev/null +++ b/src/intel/compiler/elk/brw_cfg.cpp @@ -0,0 +1,833 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +#include "brw_cfg.h" +#include "util/u_dynarray.h" +#include "brw_shader.h" + +/** @file brw_cfg.cpp + * + * Walks the shader instructions generated and creates a set of basic + * blocks with successor/predecessor edges connecting them. + */ + +using namespace brw; + +static bblock_t * +pop_stack(exec_list *list) +{ + bblock_link *link = (bblock_link *)list->get_tail(); + bblock_t *block = link->block; + link->link.remove(); + + return block; +} + +static exec_node * +link(void *mem_ctx, bblock_t *block, enum bblock_link_kind kind) +{ + bblock_link *l = new(mem_ctx) bblock_link(block, kind); + return &l->link; +} + +void +push_stack(exec_list *list, void *mem_ctx, bblock_t *block) +{ + /* The kind of the link is immaterial, but we need to provide one since + * this is (ab)using the edge data structure in order to implement a stack. 
+ */ + list->push_tail(link(mem_ctx, block, bblock_link_logical)); +} + +bblock_t::bblock_t(cfg_t *cfg) : + cfg(cfg), start_ip(0), end_ip(0), end_ip_delta(0), num(0) +{ + instructions.make_empty(); + parents.make_empty(); + children.make_empty(); +} + +void +bblock_t::add_successor(void *mem_ctx, bblock_t *successor, + enum bblock_link_kind kind) +{ + successor->parents.push_tail(::link(mem_ctx, this, kind)); + children.push_tail(::link(mem_ctx, successor, kind)); +} + +bool +bblock_t::is_predecessor_of(const bblock_t *block, + enum bblock_link_kind kind) const +{ + foreach_list_typed_safe (bblock_link, parent, link, &block->parents) { + if (parent->block == this && parent->kind <= kind) { + return true; + } + } + + return false; +} + +bool +bblock_t::is_successor_of(const bblock_t *block, + enum bblock_link_kind kind) const +{ + foreach_list_typed_safe (bblock_link, child, link, &block->children) { + if (child->block == this && child->kind <= kind) { + return true; + } + } + + return false; +} + +static bool +ends_block(const backend_instruction *inst) +{ + enum opcode op = inst->opcode; + + return op == BRW_OPCODE_IF || + op == BRW_OPCODE_ELSE || + op == BRW_OPCODE_CONTINUE || + op == BRW_OPCODE_BREAK || + op == BRW_OPCODE_DO || + op == BRW_OPCODE_WHILE; +} + +static bool +starts_block(const backend_instruction *inst) +{ + enum opcode op = inst->opcode; + + return op == BRW_OPCODE_DO || + op == BRW_OPCODE_ENDIF; +} + +bool +bblock_t::can_combine_with(const bblock_t *that) const +{ + if ((const bblock_t *)this->link.next != that) + return false; + + if (ends_block(this->end()) || + starts_block(that->start())) + return false; + + return true; +} + +void +bblock_t::combine_with(bblock_t *that) +{ + assert(this->can_combine_with(that)); + foreach_list_typed (bblock_link, link, link, &that->parents) { + assert(link->block == this); + } + + this->end_ip = that->end_ip; + this->instructions.append_list(&that->instructions); + + this->cfg->remove_block(that); +} + +void 
+bblock_t::dump(FILE *file) const +{ + const backend_shader *s = this->cfg->s; + + int ip = this->start_ip; + foreach_inst_in_block(backend_instruction, inst, this) { + fprintf(file, "%5d: ", ip); + s->dump_instruction(inst, file); + ip++; + } +} + +void +bblock_t::unlink_list(exec_list *list) +{ + assert(list == &parents || list == &children); + const bool remove_parent = list == &children; + + foreach_list_typed_safe(bblock_link, link, link, list) { + /* Also break the links from the other block back to this block. */ + exec_list *sub_list = remove_parent ? &link->block->parents : &link->block->children; + + foreach_list_typed_safe(bblock_link, sub_link, link, sub_list) { + if (sub_link->block == this) { + sub_link->link.remove(); + ralloc_free(sub_link); + } + } + + link->link.remove(); + ralloc_free(link); + } +} + +cfg_t::cfg_t(const backend_shader *s, exec_list *instructions) : + s(s) +{ + mem_ctx = ralloc_context(NULL); + block_list.make_empty(); + blocks = NULL; + num_blocks = 0; + + bblock_t *cur = NULL; + int ip = 0; + + bblock_t *entry = new_block(); + bblock_t *cur_if = NULL; /**< BB ending with IF. */ + bblock_t *cur_else = NULL; /**< BB ending with ELSE. */ + bblock_t *cur_do = NULL; /**< BB starting with DO. */ + bblock_t *cur_while = NULL; /**< BB immediately following WHILE. */ + exec_list if_stack, else_stack, do_stack, while_stack; + bblock_t *next; + + set_next_block(&cur, entry, ip); + + foreach_in_list_safe(backend_instruction, inst, instructions) { + /* set_next_block wants the post-incremented ip */ + ip++; + + inst->exec_node::remove(); + + switch (inst->opcode) { + case BRW_OPCODE_IF: + cur->instructions.push_tail(inst); + + /* Push our information onto a stack so we can recover from + * nested ifs. + */ + push_stack(&if_stack, mem_ctx, cur_if); + push_stack(&else_stack, mem_ctx, cur_else); + + cur_if = cur; + cur_else = NULL; + + /* Set up our immediately following block, full of "then" + * instructions. 
+ */ + next = new_block(); + cur_if->add_successor(mem_ctx, next, bblock_link_logical); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_ELSE: + cur->instructions.push_tail(inst); + + cur_else = cur; + + next = new_block(); + assert(cur_if != NULL); + cur_if->add_successor(mem_ctx, next, bblock_link_logical); + cur_else->add_successor(mem_ctx, next, bblock_link_physical); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_ENDIF: { + bblock_t *cur_endif; + + if (cur->instructions.is_empty()) { + /* New block was just created; use it. */ + cur_endif = cur; + } else { + cur_endif = new_block(); + + cur->add_successor(mem_ctx, cur_endif, bblock_link_logical); + + set_next_block(&cur, cur_endif, ip - 1); + } + + cur->instructions.push_tail(inst); + + if (cur_else) { + cur_else->add_successor(mem_ctx, cur_endif, bblock_link_logical); + } else { + assert(cur_if != NULL); + cur_if->add_successor(mem_ctx, cur_endif, bblock_link_logical); + } + + assert(cur_if->end()->opcode == BRW_OPCODE_IF); + assert(!cur_else || cur_else->end()->opcode == BRW_OPCODE_ELSE); + + /* Pop the stack so we're in the previous if/else/endif */ + cur_if = pop_stack(&if_stack); + cur_else = pop_stack(&else_stack); + break; + } + case BRW_OPCODE_DO: + /* Push our information onto a stack so we can recover from + * nested loops. + */ + push_stack(&do_stack, mem_ctx, cur_do); + push_stack(&while_stack, mem_ctx, cur_while); + + /* Set up the block just after the while. Don't know when exactly + * it will start, yet. + */ + cur_while = new_block(); + + if (cur->instructions.is_empty()) { + /* New block was just created; use it. 
*/ + cur_do = cur; + } else { + cur_do = new_block(); + + cur->add_successor(mem_ctx, cur_do, bblock_link_logical); + + set_next_block(&cur, cur_do, ip - 1); + } + + cur->instructions.push_tail(inst); + + /* Represent divergent execution of the loop as a pair of alternative + * edges coming out of the DO instruction: For any physical iteration + * of the loop a given logical thread can either start off enabled + * (which is represented as the "next" successor), or disabled (if it + * has reached a non-uniform exit of the loop during a previous + * iteration, which is represented as the "cur_while" successor). + * + * The disabled edge will be taken by the logical thread anytime we + * arrive at the DO instruction through a back-edge coming from a + * conditional exit of the loop where divergent control flow started. + * + * This guarantees that there is a control-flow path from any + * divergence point of the loop into the convergence point + * (immediately past the WHILE instruction) such that it overlaps the + * whole IP region of divergent control flow (potentially the whole + * loop) *and* doesn't imply the execution of any instructions part + * of the loop (since the corresponding execution mask bit will be + * disabled for a diverging thread). + * + * This way we make sure that any variables that are live throughout + * the region of divergence for an inactive logical thread are also + * considered to interfere with any other variables assigned by + * active logical threads within the same physical region of the + * program, since otherwise we would risk cross-channel data + * corruption. 
+ */ + next = new_block(); + cur->add_successor(mem_ctx, next, bblock_link_logical); + cur->add_successor(mem_ctx, cur_while, bblock_link_physical); + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_CONTINUE: + cur->instructions.push_tail(inst); + + /* A conditional CONTINUE may start a region of divergent control + * flow until the start of the next loop iteration (*not* until the + * end of the loop which is why the successor is not the top-level + * divergence point at cur_do). The live interval of any variable + * extending through a CONTINUE edge is guaranteed to overlap the + * whole region of divergent execution, because any variable live-out + * at the CONTINUE instruction will also be live-in at the top of the + * loop, and therefore also live-out at the bottom-most point of the + * loop which is reachable from the top (since a control flow path + * exists from a definition of the variable through this CONTINUE + * instruction, the top of the loop, the (reachable) bottom of the + * loop, the top of the loop again, into a use of the variable). + */ + assert(cur_do != NULL); + cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical); + + next = new_block(); + if (inst->predicate) + cur->add_successor(mem_ctx, next, bblock_link_logical); + else + cur->add_successor(mem_ctx, next, bblock_link_physical); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_BREAK: + cur->instructions.push_tail(inst); + + /* A conditional BREAK instruction may start a region of divergent + * control flow until the end of the loop if the condition is + * non-uniform, in which case the loop will execute additional + * iterations with the present channel disabled. We model this as a + * control flow path from the divergence point to the convergence + * point that overlaps the whole IP range of the loop and skips over + * the execution of any other instructions part of the loop. + * + * See the DO case for additional explanation. 
+ */ + assert(cur_do != NULL); + cur->add_successor(mem_ctx, cur_do, bblock_link_physical); + cur->add_successor(mem_ctx, cur_while, bblock_link_logical); + + next = new_block(); + if (inst->predicate) + cur->add_successor(mem_ctx, next, bblock_link_logical); + else + cur->add_successor(mem_ctx, next, bblock_link_physical); + + set_next_block(&cur, next, ip); + break; + + case BRW_OPCODE_WHILE: + cur->instructions.push_tail(inst); + + assert(cur_do != NULL && cur_while != NULL); + + /* A conditional WHILE instruction may start a region of divergent + * control flow until the end of the loop, just like the BREAK + * instruction. See the BREAK case for more details. OTOH an + * unconditional WHILE instruction is non-divergent (just like an + * unconditional CONTINUE), and will necessarily lead to the + * execution of an additional iteration of the loop for all enabled + * channels, so we may skip over the divergence point at the top of + * the loop to keep the CFG as unambiguous as possible. + */ + if (inst->predicate) { + cur->add_successor(mem_ctx, cur_do, bblock_link_logical); + } else { + cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical); + } + + set_next_block(&cur, cur_while, ip); + + /* Pop the stack so we're in the previous loop */ + cur_do = pop_stack(&do_stack); + cur_while = pop_stack(&while_stack); + break; + + default: + cur->instructions.push_tail(inst); + break; + } + } + + cur->end_ip = ip - 1; + + make_block_array(); +} + +cfg_t::~cfg_t() +{ + ralloc_free(mem_ctx); +} + +void +cfg_t::remove_block(bblock_t *block) +{ + foreach_list_typed_safe (bblock_link, predecessor, link, &block->parents) { + /* cfg_t::validate checks that predecessor and successor lists are well + * formed, so it is known that the loop here would find exactly one + * block. Set old_link_kind to silence "variable used but not set" + * warnings. 
+ */ + bblock_link_kind old_link_kind = bblock_link_logical; + + /* Remove block from all of its predecessors' successor lists. */ + foreach_list_typed_safe (bblock_link, successor, link, + &predecessor->block->children) { + if (block == successor->block) { + old_link_kind = successor->kind; + successor->link.remove(); + ralloc_free(successor); + break; + } + } + + /* Add removed-block's successors to its predecessors' successor lists. */ + foreach_list_typed (bblock_link, successor, link, &block->children) { + bool need_to_link = true; + bblock_link_kind new_link_kind = MAX2(old_link_kind, successor->kind); + + foreach_list_typed_safe (bblock_link, child, link, &predecessor->block->children) { + /* There is already a link between the two blocks. If the links + * are the same kind or the link is logical, do nothing. If the + * existing link is physical and the proposed new link is logical, + * promote the existing link to logical. + * + * This is accomplished by taking the minimum of the existing link + * kind and the proposed link kind. + */ + if (child->block == successor->block) { + child->kind = MIN2(child->kind, new_link_kind); + need_to_link = false; + break; + } + } + + if (need_to_link) { + predecessor->block->children.push_tail(link(mem_ctx, + successor->block, + new_link_kind)); + } + } + } + + foreach_list_typed_safe (bblock_link, successor, link, &block->children) { + /* cfg_t::validate checks that predecessor and successor lists are well + * formed, so it is known that the loop here would find exactly one + * block. Set old_link_kind to silence "variable used but not set" + * warnings. + */ + bblock_link_kind old_link_kind = bblock_link_logical; + + /* Remove block from all of its childrens' parents lists. 
*/ + foreach_list_typed_safe (bblock_link, predecessor, link, + &successor->block->parents) { + if (block == predecessor->block) { + old_link_kind = predecessor->kind; + predecessor->link.remove(); + ralloc_free(predecessor); + } + } + + /* Add removed-block's predecessors to its successors' predecessor lists. */ + foreach_list_typed (bblock_link, predecessor, link, &block->parents) { + bool need_to_link = true; + bblock_link_kind new_link_kind = MAX2(old_link_kind, predecessor->kind); + + foreach_list_typed_safe (bblock_link, parent, link, &successor->block->parents) { + /* There is already a link between the two blocks. If the links + * are the same kind or the link is logical, do nothing. If the + * existing link is physical and the proposed new link is logical, + * promote the existing link to logical. + * + * This is accomplished by taking the minimum of the existing link + * kind and the proposed link kind. + */ + if (parent->block == predecessor->block) { + parent->kind = MIN2(parent->kind, new_link_kind); + need_to_link = false; + break; + } + } + + if (need_to_link) { + successor->block->parents.push_tail(link(mem_ctx, + predecessor->block, + new_link_kind)); + } + } + } + + block->link.remove(); + + for (int b = block->num; b < this->num_blocks - 1; b++) { + this->blocks[b] = this->blocks[b + 1]; + this->blocks[b]->num = b; + } + + this->blocks[this->num_blocks - 1]->num = this->num_blocks - 2; + this->num_blocks--; +} + +bblock_t * +cfg_t::new_block() +{ + bblock_t *block = new(mem_ctx) bblock_t(this); + + return block; +} + +void +cfg_t::set_next_block(bblock_t **cur, bblock_t *block, int ip) +{ + if (*cur) { + (*cur)->end_ip = ip - 1; + } + + block->start_ip = ip; + block->num = num_blocks++; + block_list.push_tail(&block->link); + *cur = block; +} + +void +cfg_t::make_block_array() +{ + blocks = ralloc_array(mem_ctx, bblock_t *, num_blocks); + + int i = 0; + foreach_block (block, this) { + blocks[i++] = block; + } + assert(i == num_blocks); +} + 
+namespace { + +struct link_desc { + char kind; + int num; +}; + +int +compare_link_desc(const void *a, const void *b) +{ + const link_desc *la = (const link_desc *)a; + const link_desc *lb = (const link_desc *)b; + + return la->num < lb->num ? -1 : + la->num > lb->num ? +1 : + la->kind < lb->kind ? -1 : + la->kind > lb->kind ? +1 : + 0; +} + +void +sort_links(util_dynarray *scratch, exec_list *list) +{ + util_dynarray_clear(scratch); + foreach_list_typed(bblock_link, link, link, list) { + link_desc l; + l.kind = link->kind == bblock_link_logical ? '-' : '~'; + l.num = link->block->num; + util_dynarray_append(scratch, link_desc, l); + } + qsort(scratch->data, util_dynarray_num_elements(scratch, link_desc), + sizeof(link_desc), compare_link_desc); +} + +} /* namespace */ + +void +cfg_t::dump(FILE *file) +{ + const idom_tree *idom = (s ? &s->idom_analysis.require() : NULL); + + /* Temporary storage to sort the lists of blocks. This normalizes the + * output, making it possible to use it for certain tests. + */ + util_dynarray scratch; + util_dynarray_init(&scratch, NULL); + + foreach_block (block, this) { + if (idom && idom->parent(block)) + fprintf(file, "START B%d IDOM(B%d)", block->num, + idom->parent(block)->num); + else + fprintf(file, "START B%d IDOM(none)", block->num); + + sort_links(&scratch, &block->parents); + util_dynarray_foreach(&scratch, link_desc, l) + fprintf(file, " <%cB%d", l->kind, l->num); + fprintf(file, "\n"); + + if (s != NULL) + block->dump(file); + fprintf(file, "END B%d", block->num); + + sort_links(&scratch, &block->children); + util_dynarray_foreach(&scratch, link_desc, l) + fprintf(file, " %c>B%d", l->kind, l->num); + fprintf(file, "\n"); + } + + util_dynarray_fini(&scratch); +} + +/* Calculates the immediate dominator of each block, according to "A Simple, + * Fast Dominance Algorithm" by Keith D. Cooper, Timothy J. Harvey, and Ken + * Kennedy. 
+ * + * The authors claim that for control flow graphs of sizes normally encountered + * (less than 1000 nodes) that this algorithm is significantly faster than + * others like Lengauer-Tarjan. + */ +idom_tree::idom_tree(const backend_shader *s) : + num_parents(s->cfg->num_blocks), + parents(new bblock_t *[num_parents]()) +{ + bool changed; + + parents[0] = s->cfg->blocks[0]; + + do { + changed = false; + + foreach_block(block, s->cfg) { + if (block->num == 0) + continue; + + bblock_t *new_idom = NULL; + foreach_list_typed(bblock_link, parent_link, link, &block->parents) { + if (parent(parent_link->block)) { + new_idom = (new_idom ? intersect(new_idom, parent_link->block) : + parent_link->block); + } + } + + if (parent(block) != new_idom) { + parents[block->num] = new_idom; + changed = true; + } + } + } while (changed); +} + +idom_tree::~idom_tree() +{ + delete[] parents; +} + +bblock_t * +idom_tree::intersect(bblock_t *b1, bblock_t *b2) const +{ + /* Note, the comparisons here are the opposite of what the paper says + * because we index blocks from beginning -> end (i.e. reverse post-order) + * instead of post-order like they assume. 
+ */ + while (b1->num != b2->num) { + while (b1->num > b2->num) + b1 = parent(b1); + while (b2->num > b1->num) + b2 = parent(b2); + } + assert(b1); + return b1; +} + +void +idom_tree::dump() const +{ + printf("digraph DominanceTree {\n"); + for (unsigned i = 0; i < num_parents; i++) + printf("\t%d -> %d\n", parents[i]->num, i); + printf("}\n"); +} + +void +cfg_t::dump_cfg() +{ + printf("digraph CFG {\n"); + for (int b = 0; b < num_blocks; b++) { + bblock_t *block = this->blocks[b]; + + foreach_list_typed_safe (bblock_link, child, link, &block->children) { + printf("\t%d -> %d\n", b, child->block->num); + } + } + printf("}\n"); +} + +#define cfgv_assert(assertion) \ + { \ + if (!(assertion)) { \ + fprintf(stderr, "ASSERT: CFG validation in %s failed!\n", stage_abbrev); \ + fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \ + abort(); \ + } \ + } + +#ifndef NDEBUG +void +cfg_t::validate(const char *stage_abbrev) +{ + foreach_block(block, this) { + foreach_list_typed(bblock_link, successor, link, &block->children) { + /* Each successor of a block must have one predecessor link back to + * the block. + */ + bool successor_links_back_to_predecessor = false; + bblock_t *succ_block = successor->block; + + foreach_list_typed(bblock_link, predecessor, link, &succ_block->parents) { + if (predecessor->block == block) { + cfgv_assert(!successor_links_back_to_predecessor); + cfgv_assert(successor->kind == predecessor->kind); + successor_links_back_to_predecessor = true; + } + } + + cfgv_assert(successor_links_back_to_predecessor); + + /* Each successor block must appear only once in the list of + * successors. + */ + foreach_list_typed_from(bblock_link, later_successor, link, + &block->children, successor->link.next) { + cfgv_assert(successor->block != later_successor->block); + } + } + + foreach_list_typed(bblock_link, predecessor, link, &block->parents) { + /* Each predecessor of a block must have one successor link back to + * the block. 
+ */ + bool predecessor_links_back_to_successor = false; + bblock_t *pred_block = predecessor->block; + + foreach_list_typed(bblock_link, successor, link, &pred_block->children) { + if (successor->block == block) { + cfgv_assert(!predecessor_links_back_to_successor); + cfgv_assert(successor->kind == predecessor->kind); + predecessor_links_back_to_successor = true; + } + } + + cfgv_assert(predecessor_links_back_to_successor); + + /* Each precessor block must appear only once in the list of + * precessors. + */ + foreach_list_typed_from(bblock_link, later_precessor, link, + &block->parents, predecessor->link.next) { + cfgv_assert(predecessor->block != later_precessor->block); + } + } + + backend_instruction *first_inst = block->start(); + if (first_inst->opcode == BRW_OPCODE_DO) { + /* DO instructions both begin and end a block, so the DO instruction + * must be the only instruction in the block. + */ + cfgv_assert(exec_list_is_singular(&block->instructions)); + + /* A block starting with DO should have exactly two successors. One + * is a physical link to the block starting after the WHILE + * instruction. The other is a logical link to the block starting the + * body of the loop. 
+ */ + bblock_t *physical_block = nullptr; + bblock_t *logical_block = nullptr; + + foreach_list_typed(bblock_link, child, link, &block->children) { + if (child->kind == bblock_link_physical) { + cfgv_assert(physical_block == nullptr); + physical_block = child->block; + } else { + cfgv_assert(logical_block == nullptr); + logical_block = child->block; + } + } + + cfgv_assert(logical_block != nullptr); + cfgv_assert(physical_block != nullptr); + } + } +} +#endif diff --git a/src/intel/compiler/elk/brw_cfg.h b/src/intel/compiler/elk/brw_cfg.h new file mode 100644 index 00000000000..7784ab43784 --- /dev/null +++ b/src/intel/compiler/elk/brw_cfg.h @@ -0,0 +1,532 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * Authors: + * Eric Anholt + * + */ + +#ifndef BRW_CFG_H +#define BRW_CFG_H + +#include "brw_ir.h" +#ifdef __cplusplus +#include "brw_ir_analysis.h" +#endif + +struct bblock_t; + +/** + * CFG edge types. + * + * A logical edge represents a potential control flow path of the original + * scalar program, while a physical edge represents a control flow path that + * may not have existed in the original program but was introduced during + * vectorization in order to implement divergent control flow of different + * shader invocations within the same SIMD thread. + * + * All logical edges in the CFG are considered to be physical edges but not + * the other way around -- I.e. the logical CFG is a subset of the physical + * one. + */ +enum bblock_link_kind { + bblock_link_logical = 0, + bblock_link_physical +}; + +struct bblock_link { +#ifdef __cplusplus + DECLARE_RALLOC_CXX_OPERATORS(bblock_link) + + bblock_link(bblock_t *block, enum bblock_link_kind kind) + : block(block), kind(kind) + { + } +#endif + + struct exec_node link; + struct bblock_t *block; + + /* Type of this CFG edge. Because bblock_link_logical also implies + * bblock_link_physical, the proper way to test for membership of edge 'l' + * in CFG kind 'k' is 'l.kind <= k'. 
+ */ + enum bblock_link_kind kind; +}; + +struct backend_shader; +struct cfg_t; + +struct bblock_t { +#ifdef __cplusplus + DECLARE_RALLOC_CXX_OPERATORS(bblock_t) + + explicit bblock_t(cfg_t *cfg); + + void add_successor(void *mem_ctx, bblock_t *successor, + enum bblock_link_kind kind); + bool is_predecessor_of(const bblock_t *block, + enum bblock_link_kind kind) const; + bool is_successor_of(const bblock_t *block, + enum bblock_link_kind kind) const; + bool can_combine_with(const bblock_t *that) const; + void combine_with(bblock_t *that); + void dump(FILE *file = stderr) const; + + backend_instruction *start(); + const backend_instruction *start() const; + backend_instruction *end(); + const backend_instruction *end() const; + + bblock_t *next(); + const bblock_t *next() const; + bblock_t *prev(); + const bblock_t *prev() const; + + bool starts_with_control_flow() const; + bool ends_with_control_flow() const; + + backend_instruction *first_non_control_flow_inst(); + backend_instruction *last_non_control_flow_inst(); + +private: + /** + * \sa unlink_parents, unlink_children + */ + void unlink_list(exec_list *); + +public: + void unlink_parents() + { + unlink_list(&parents); + } + + void unlink_children() + { + unlink_list(&children); + } +#endif + + struct exec_node link; + struct cfg_t *cfg; + + int start_ip; + int end_ip; + + /** + * Change in end_ip since the last time IPs of later blocks were updated. 
+ */ + int end_ip_delta; + + struct exec_list instructions; + struct exec_list parents; + struct exec_list children; + int num; +}; + +static inline struct backend_instruction * +bblock_start(struct bblock_t *block) +{ + return (struct backend_instruction *)exec_list_get_head(&block->instructions); +} + +static inline const struct backend_instruction * +bblock_start_const(const struct bblock_t *block) +{ + return (const struct backend_instruction *)exec_list_get_head_const(&block->instructions); +} + +static inline struct backend_instruction * +bblock_end(struct bblock_t *block) +{ + return (struct backend_instruction *)exec_list_get_tail(&block->instructions); +} + +static inline const struct backend_instruction * +bblock_end_const(const struct bblock_t *block) +{ + return (const struct backend_instruction *)exec_list_get_tail_const(&block->instructions); +} + +static inline struct bblock_t * +bblock_next(struct bblock_t *block) +{ + if (exec_node_is_tail_sentinel(block->link.next)) + return NULL; + + return (struct bblock_t *)block->link.next; +} + +static inline const struct bblock_t * +bblock_next_const(const struct bblock_t *block) +{ + if (exec_node_is_tail_sentinel(block->link.next)) + return NULL; + + return (const struct bblock_t *)block->link.next; +} + +static inline struct bblock_t * +bblock_prev(struct bblock_t *block) +{ + if (exec_node_is_head_sentinel(block->link.prev)) + return NULL; + + return (struct bblock_t *)block->link.prev; +} + +static inline const struct bblock_t * +bblock_prev_const(const struct bblock_t *block) +{ + if (exec_node_is_head_sentinel(block->link.prev)) + return NULL; + + return (const struct bblock_t *)block->link.prev; +} + +static inline bool +bblock_starts_with_control_flow(const struct bblock_t *block) +{ + enum opcode op = bblock_start_const(block)->opcode; + return op == BRW_OPCODE_DO || op == BRW_OPCODE_ENDIF; +} + +static inline bool +bblock_ends_with_control_flow(const struct bblock_t *block) +{ + enum opcode op = 
bblock_end_const(block)->opcode; + return op == BRW_OPCODE_IF || + op == BRW_OPCODE_ELSE || + op == BRW_OPCODE_WHILE || + op == BRW_OPCODE_BREAK || + op == BRW_OPCODE_CONTINUE; +} + +static inline struct backend_instruction * +bblock_first_non_control_flow_inst(struct bblock_t *block) +{ + struct backend_instruction *inst = bblock_start(block); + if (bblock_starts_with_control_flow(block)) +#ifdef __cplusplus + inst = (struct backend_instruction *)inst->next; +#else + inst = (struct backend_instruction *)inst->link.next; +#endif + return inst; +} + +static inline struct backend_instruction * +bblock_last_non_control_flow_inst(struct bblock_t *block) +{ + struct backend_instruction *inst = bblock_end(block); + if (bblock_ends_with_control_flow(block)) +#ifdef __cplusplus + inst = (struct backend_instruction *)inst->prev; +#else + inst = (struct backend_instruction *)inst->link.prev; +#endif + return inst; +} + +#ifdef __cplusplus +inline backend_instruction * +bblock_t::start() +{ + return bblock_start(this); +} + +inline const backend_instruction * +bblock_t::start() const +{ + return bblock_start_const(this); +} + +inline backend_instruction * +bblock_t::end() +{ + return bblock_end(this); +} + +inline const backend_instruction * +bblock_t::end() const +{ + return bblock_end_const(this); +} + +inline bblock_t * +bblock_t::next() +{ + return bblock_next(this); +} + +inline const bblock_t * +bblock_t::next() const +{ + return bblock_next_const(this); +} + +inline bblock_t * +bblock_t::prev() +{ + return bblock_prev(this); +} + +inline const bblock_t * +bblock_t::prev() const +{ + return bblock_prev_const(this); +} + +inline bool +bblock_t::starts_with_control_flow() const +{ + return bblock_starts_with_control_flow(this); +} + +inline bool +bblock_t::ends_with_control_flow() const +{ + return bblock_ends_with_control_flow(this); +} + +inline backend_instruction * +bblock_t::first_non_control_flow_inst() +{ + return bblock_first_non_control_flow_inst(this); +} + 
+inline backend_instruction * +bblock_t::last_non_control_flow_inst() +{ + return bblock_last_non_control_flow_inst(this); +} +#endif + +struct cfg_t { +#ifdef __cplusplus + DECLARE_RALLOC_CXX_OPERATORS(cfg_t) + + cfg_t(const backend_shader *s, exec_list *instructions); + ~cfg_t(); + + void remove_block(bblock_t *block); + + bblock_t *first_block(); + const bblock_t *first_block() const; + bblock_t *last_block(); + const bblock_t *last_block() const; + + bblock_t *new_block(); + void set_next_block(bblock_t **cur, bblock_t *block, int ip); + void make_block_array(); + + void dump(FILE *file = stderr); + void dump_cfg(); + +#ifdef NDEBUG + void validate(UNUSED const char *stage_abbrev) { } +#else + void validate(const char *stage_abbrev); +#endif + + /** + * Propagate bblock_t::end_ip_delta data through the CFG. + */ + inline void adjust_block_ips(); + +#endif + const struct backend_shader *s; + void *mem_ctx; + + /** Ordered list (by ip) of basic blocks */ + struct exec_list block_list; + struct bblock_t **blocks; + int num_blocks; +}; + +static inline struct bblock_t * +cfg_first_block(struct cfg_t *cfg) +{ + return (struct bblock_t *)exec_list_get_head(&cfg->block_list); +} + +static inline const struct bblock_t * +cfg_first_block_const(const struct cfg_t *cfg) +{ + return (const struct bblock_t *)exec_list_get_head_const(&cfg->block_list); +} + +static inline struct bblock_t * +cfg_last_block(struct cfg_t *cfg) +{ + return (struct bblock_t *)exec_list_get_tail(&cfg->block_list); +} + +static inline const struct bblock_t * +cfg_last_block_const(const struct cfg_t *cfg) +{ + return (const struct bblock_t *)exec_list_get_tail_const(&cfg->block_list); +} + +#ifdef __cplusplus +inline bblock_t * +cfg_t::first_block() +{ + return cfg_first_block(this); +} + +const inline bblock_t * +cfg_t::first_block() const +{ + return cfg_first_block_const(this); +} + +inline bblock_t * +cfg_t::last_block() +{ + return cfg_last_block(this); +} + +const inline bblock_t * 
+cfg_t::last_block() const +{ + return cfg_last_block_const(this); +} +#endif + +/* Note that this is implemented with a double for loop -- break will + * break from the inner loop only! + */ +#define foreach_block_and_inst(__block, __type, __inst, __cfg) \ + foreach_block (__block, __cfg) \ + foreach_inst_in_block (__type, __inst, __block) + +/* Note that this is implemented with a double for loop -- break will + * break from the inner loop only! + */ +#define foreach_block_and_inst_safe(__block, __type, __inst, __cfg) \ + foreach_block_safe (__block, __cfg) \ + foreach_inst_in_block_safe (__type, __inst, __block) + +#define foreach_block(__block, __cfg) \ + foreach_list_typed (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_block_reverse(__block, __cfg) \ + foreach_list_typed_reverse (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_block_safe(__block, __cfg) \ + foreach_list_typed_safe (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_block_reverse_safe(__block, __cfg) \ + foreach_list_typed_reverse_safe (bblock_t, __block, link, &(__cfg)->block_list) + +#define foreach_inst_in_block(__type, __inst, __block) \ + foreach_in_list(__type, __inst, &(__block)->instructions) + +#define foreach_inst_in_block_safe(__type, __inst, __block) \ + for (__type *__inst = (__type *)__block->instructions.head_sentinel.next, \ + *__next = (__type *)__inst->next; \ + __next != NULL; \ + __inst = __next, \ + __next = (__type *)__next->next) + +#define foreach_inst_in_block_reverse(__type, __inst, __block) \ + foreach_in_list_reverse(__type, __inst, &(__block)->instructions) + +#define foreach_inst_in_block_reverse_safe(__type, __inst, __block) \ + foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions) + +#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \ + for (__type *__scan_inst = (__type *)__inst->next; \ + !__scan_inst->is_tail_sentinel(); \ + __scan_inst = (__type *)__scan_inst->next) + 
+#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \ + for (__type *__scan_inst = (__type *)__inst->prev; \ + !__scan_inst->is_head_sentinel(); \ + __scan_inst = (__type *)__scan_inst->prev) + +#ifdef __cplusplus +inline void +cfg_t::adjust_block_ips() +{ + int delta = 0; + + foreach_block(block, this) { + block->start_ip += delta; + block->end_ip += delta; + + delta += block->end_ip_delta; + + block->end_ip_delta = 0; + } +} + +namespace brw { + /** + * Immediate dominator tree analysis of a shader. + */ + struct idom_tree { + idom_tree(const backend_shader *s); + ~idom_tree(); + + bool + validate(const backend_shader *) const + { + /* FINISHME */ + return true; + } + + analysis_dependency_class + dependency_class() const + { + return DEPENDENCY_BLOCKS; + } + + const bblock_t * + parent(const bblock_t *b) const + { + assert(unsigned(b->num) < num_parents); + return parents[b->num]; + } + + bblock_t * + parent(bblock_t *b) const + { + assert(unsigned(b->num) < num_parents); + return parents[b->num]; + } + + bblock_t * + intersect(bblock_t *b1, bblock_t *b2) const; + + void + dump() const; + + private: + unsigned num_parents; + bblock_t **parents; + }; +} +#endif + +#endif /* BRW_CFG_H */ diff --git a/src/intel/compiler/elk/brw_clip.h b/src/intel/compiler/elk/brw_clip.h new file mode 100644 index 00000000000..4ca89455963 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip.h @@ -0,0 +1,163 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#ifndef BRW_CLIP_H +#define BRW_CLIP_H + +#include "brw_compiler.h" +#include "brw_eu.h" + +/* Initial 3 verts, plus at most 6 additional verts from intersections + * with fixed planes, plus at most 8 additional verts from intersections + * with user clip planes + */ +#define MAX_VERTS (3+6+8) + +#define PRIM_MASK (0x1f) + +struct brw_clip_compile { + struct brw_codegen func; + struct brw_clip_prog_key key; + struct brw_clip_prog_data prog_data; + + struct { + struct brw_reg R0; + struct brw_reg vertex[MAX_VERTS]; + + struct brw_reg t; + struct brw_reg t0, t1; + struct brw_reg dp0, dp1; + + struct brw_reg dpPrev; + struct brw_reg dp; + struct brw_reg loopcount; + struct brw_reg nr_verts; + struct brw_reg planemask; + + struct brw_reg inlist; + struct brw_reg outlist; + struct brw_reg freelist; + + struct brw_reg dir; + struct brw_reg tmp0, tmp1; + struct brw_reg offset; + + struct brw_reg fixed_planes; + struct brw_reg plane_equation; + + struct brw_reg ff_sync; + + /* Bitmask indicating which coordinate attribute should be used for + * comparison to each clipping plane. A 0 indicates that VARYING_SLOT_POS + * should be used, because it's one of the fixed +/- x/y/z planes that + * constitute the bounds of the view volume. A 1 indicates that + * VARYING_SLOT_CLIP_VERTEX should be used (if available) since it's a user- + * defined clipping plane. + */ + struct brw_reg vertex_src_mask; + + /* Offset into the vertex of the current plane's clipdistance value */ + struct brw_reg clipdistance_offset; + } reg; + + /* Number of registers storing VUE data */ + GLuint nr_regs; + + GLuint first_tmp; + GLuint last_tmp; + + bool need_direction; + + struct intel_vue_map vue_map; +}; + +/** + * True if the given varying is one of the outputs of the vertex shader. 
+ */ +static inline bool brw_clip_have_varying(struct brw_clip_compile *c, + GLuint varying) +{ + return (c->key.attrs & BITFIELD64_BIT(varying)) ? 1 : 0; +} + +/* Points are only culled, so no need for a clip routine, however it + * works out easier to have a dummy one. + */ +void brw_emit_unfilled_clip( struct brw_clip_compile *c ); +void brw_emit_tri_clip( struct brw_clip_compile *c ); +void brw_emit_line_clip( struct brw_clip_compile *c ); +void brw_emit_point_clip( struct brw_clip_compile *c ); + +/* brw_clip_tri.c, for use by the unfilled clip routine: + */ +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ); +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ); +void brw_clip_tri( struct brw_clip_compile *c ); +void brw_clip_tri_emit_polygon( struct brw_clip_compile *c ); +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, + GLuint nr_verts ); + + +/* Utils: + */ + +void brw_clip_interp_vertex( struct brw_clip_compile *c, + struct brw_indirect dest_ptr, + struct brw_indirect v0_ptr, /* from */ + struct brw_indirect v1_ptr, /* to */ + struct brw_reg t0, + bool force_edgeflag ); + +void brw_clip_init_planes( struct brw_clip_compile *c ); + +void brw_clip_emit_vue(struct brw_clip_compile *c, + struct brw_indirect vert, + enum brw_urb_write_flags flags, + GLuint header); + +void brw_clip_kill_thread(struct brw_clip_compile *c); + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ); +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ); + +void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c, + GLuint to, GLuint from ); + +void brw_clip_init_clipmask( struct brw_clip_compile *c ); + +struct brw_reg get_tmp( struct brw_clip_compile *c ); + +void brw_clip_project_position(struct brw_clip_compile *c, + struct brw_reg pos ); +void brw_clip_ff_sync(struct brw_clip_compile *c); +void brw_clip_init_ff_sync(struct brw_clip_compile *c); + +#endif diff --git a/src/intel/compiler/elk/brw_clip_line.c 
b/src/intel/compiler/elk/brw_clip_line.c new file mode 100644 index 00000000000..b71173277d9 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_line.c @@ -0,0 +1,303 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" +#include "brw_prim.h" + +static void brw_clip_line_alloc_regs( struct brw_clip_compile *c ) +{ + const struct intel_device_info *devinfo = c->func.devinfo; + GLuint i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + if (c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec4_grf(i, 0); + i += (6 + c->key.nr_userclip + 1) / 2; + + c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; + } + else + c->prog_data.curb_read_length = 0; + + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < 4; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + c->reg.t = brw_vec1_grf(i, 0); + c->reg.t0 = brw_vec1_grf(i, 1); + c->reg.t1 = brw_vec1_grf(i, 2); + c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); + c->reg.plane_equation = brw_vec4_grf(i, 4); + i++; + + c->reg.dp0 = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ + c->reg.dp1 = brw_vec1_grf(i, 4); + i++; + + if (!c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec8_grf(i, 0); + i++; + } + + c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W); + i++; + + if (devinfo->ver == 5) { + c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + i++; + } + + c->first_tmp = i; + c->last_tmp = i; + + c->prog_data.urb_read_length = c->nr_regs; /* ? 
*/ + c->prog_data.total_grf = i; +} + + +/* Line clipping, more or less following the following algorithm: + * + * for (p=0;p t1) t1 = t; + * } else { + * GLfloat t = dp0 / (dp0 - dp1); + * if (t > t0) t0 = t; + * } + * + * if (t0 + t1 >= 1.0) + * return; + * } + * } + * + * interp( ctx, newvtx0, vtx0, vtx1, t0 ); + * interp( ctx, newvtx1, vtx1, vtx0, t1 ); + * + */ +static void clip_and_emit_line( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_indirect vtx0 = brw_indirect(0, 0); + struct brw_indirect vtx1 = brw_indirect(1, 0); + struct brw_indirect newvtx0 = brw_indirect(2, 0); + struct brw_indirect newvtx1 = brw_indirect(3, 0); + struct brw_indirect plane_ptr = brw_indirect(4, 0); + struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD); + GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + GLint clipdist0_offset = c->key.nr_userclip + ? brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0) + : 0; + + brw_MOV(p, get_addr_reg(vtx0), brw_address(c->reg.vertex[0])); + brw_MOV(p, get_addr_reg(vtx1), brw_address(c->reg.vertex[1])); + brw_MOV(p, get_addr_reg(newvtx0), brw_address(c->reg.vertex[2])); + brw_MOV(p, get_addr_reg(newvtx1), brw_address(c->reg.vertex[3])); + brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c)); + + /* Note: init t0, t1 together: + */ + brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0)); + + brw_clip_init_planes(c); + brw_clip_init_clipmask(c); + + /* -ve rhw workaround */ + if (p->devinfo->has_negative_rhw_bug) { + brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), + brw_imm_ud(1<<20)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + } + + /* Set the initial vertex source mask: The first 6 planes are the bounds + * of the view volume; the next 8 planes are the user clipping 
planes. + */ + brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0)); + + /* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0]. + * We'll increment 6 times before we start hitting actual user clipping. */ + brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float))); + + brw_DO(p, BRW_EXECUTE_1); + { + /* if (planemask & 1) + */ + brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + + brw_IF(p, BRW_EXECUTE_1); + { + brw_AND(p, v1_null_ud, c->reg.vertex_src_mask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + /* user clip distance: just fetch the correct float from each vertex */ + struct brw_indirect temp_ptr = brw_indirect(7, 0); + brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx0), c->reg.clipdistance_offset); + brw_MOV(p, c->reg.dp0, deref_1f(temp_ptr, 0)); + brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx1), c->reg.clipdistance_offset); + brw_MOV(p, c->reg.dp1, deref_1f(temp_ptr, 0)); + } + brw_ELSE(p); + { + /* fixed plane: fetch the hpos, dp4 against the plane. */ + if (c->key.nr_userclip) + brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); + else + brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + + brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, hpos_offset), c->reg.plane_equation); + brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, hpos_offset), c->reg.plane_equation); + } + brw_ENDIF(p); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, vec1(c->reg.dp1), brw_imm_f(0.0f)); + + brw_IF(p, BRW_EXECUTE_1); + { + /* + * Both can be negative on GM965/G965 due to RHW workaround + * if so, this object should be rejected. 
+ */ + if (p->devinfo->has_negative_rhw_bug) { + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); + } + + brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1); + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 ); + brw_MOV(p, c->reg.t1, c->reg.t); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + BRW_PREDICATE_NORMAL); + } + brw_ELSE(p); + { + /* Coming back in. We know that both cannot be negative + * because the line would have been culled in that case. + */ + + /* If both are positive, do nothing */ + /* Only on GM965/G965 */ + if (p->devinfo->has_negative_rhw_bug) { + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0)); + brw_IF(p, BRW_EXECUTE_1); + } + + { + brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0); + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 ); + brw_MOV(p, c->reg.t0, c->reg.t); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + BRW_PREDICATE_NORMAL); + } + + if (p->devinfo->has_negative_rhw_bug) { + brw_ENDIF(p); + } + } + brw_ENDIF(p); + } + brw_ENDIF(p); + + /* plane_ptr++; + */ + brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + + /* while (planemask>>=1) != 0 + */ + brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, 
BRW_PREDICATE_NORMAL); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, false); + brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, false); + + brw_clip_emit_vue(c, newvtx0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START); + brw_clip_emit_vue(c, newvtx1, BRW_URB_WRITE_EOT_COMPLETE, + (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END); + } + brw_ENDIF(p); + brw_clip_kill_thread(c); +} + + + +void brw_emit_line_clip( struct brw_clip_compile *c ) +{ + brw_clip_line_alloc_regs(c); + brw_clip_init_ff_sync(c); + + if (c->key.contains_flat_varying) { + if (c->key.pv_first) + brw_clip_copy_flatshaded_attributes(c, 1, 0); + else + brw_clip_copy_flatshaded_attributes(c, 0, 1); + } + + clip_and_emit_line(c); +} diff --git a/src/intel/compiler/elk/brw_clip_point.c b/src/intel/compiler/elk/brw_clip_point.c new file mode 100644 index 00000000000..1cfb5f23357 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_point.c @@ -0,0 +1,45 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" + + +/* Point clipping, nothing to do? + */ +void brw_emit_point_clip( struct brw_clip_compile *c ) +{ + /* Send an empty message to kill the thread: + */ + brw_clip_tri_alloc_regs(c, 0); + brw_clip_init_ff_sync(c); + + brw_clip_kill_thread(c); +} diff --git a/src/intel/compiler/elk/brw_clip_tri.c b/src/intel/compiler/elk/brw_clip_tri.c new file mode 100644 index 00000000000..a5bc2b85c12 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_tri.c @@ -0,0 +1,659 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" +#include "brw_prim.h" + +static void release_tmps( struct brw_clip_compile *c ) +{ + c->last_tmp = c->first_tmp; +} + + +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, + GLuint nr_verts ) +{ + const struct intel_device_info *devinfo = c->func.devinfo; + GLuint i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + if (c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec4_grf(i, 0); + i += (6 + c->key.nr_userclip + 1) / 2; + + c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; + } + else + c->prog_data.curb_read_length = 0; + + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < nr_verts; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + if (c->vue_map.num_slots % 2 && nr_verts > 0) { + /* The VUE has an odd number of slots so the last register is only half + * used. Fill the second half with zero. 
+ */ + for (j = 0; j < 3; j++) { + GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots); + + brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0)); + } + } + + c->reg.t = brw_vec1_grf(i, 0); + c->reg.loopcount = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_D); + c->reg.nr_verts = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD); + c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); + c->reg.plane_equation = brw_vec4_grf(i, 4); + i++; + + c->reg.dpPrev = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ + c->reg.dp = brw_vec1_grf(i, 4); + i++; + + c->reg.inlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + c->reg.outlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + c->reg.freelist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + if (!c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec8_grf(i, 0); + i++; + } + + if (c->key.do_unfilled) { + c->reg.dir = brw_vec4_grf(i, 0); + c->reg.offset = brw_vec4_grf(i, 4); + i++; + c->reg.tmp0 = brw_vec4_grf(i, 0); + c->reg.tmp1 = brw_vec4_grf(i, 4); + i++; + } + + c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W); + i++; + + if (devinfo->ver == 5) { + c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); + i++; + } + + c->first_tmp = i; + c->last_tmp = i; + + c->prog_data.urb_read_length = c->nr_regs; /* ? */ + c->prog_data.total_grf = i; +} + + + +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + + /* Initial list of indices for incoming vertices: + */ + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE)); + + /* XXX: Is there an easier way to do this? 
Need to reverse every + * second tristrip element: Can ignore sometimes? + */ + brw_IF(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]) ); + brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]) ); + if (c->need_direction) + brw_MOV(p, c->reg.dir, brw_imm_f(-1)); + } + brw_ELSE(p); + { + brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]) ); + brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]) ); + if (c->need_direction) + brw_MOV(p, c->reg.dir, brw_imm_f(1)); + } + brw_ENDIF(p); + + brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]) ); + brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0)); + brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3)); +} + + + +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_POLYGON)); + + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_copy_flatshaded_attributes(c, 1, 0); + brw_clip_copy_flatshaded_attributes(c, 2, 0); + } + brw_ELSE(p); + { + if (c->key.pv_first) { + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_TRIFAN)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_copy_flatshaded_attributes(c, 0, 1); + brw_clip_copy_flatshaded_attributes(c, 2, 1); + } + brw_ELSE(p); + { + brw_clip_copy_flatshaded_attributes(c, 1, 0); + brw_clip_copy_flatshaded_attributes(c, 2, 0); + } + brw_ENDIF(p); + } + else { + brw_clip_copy_flatshaded_attributes(c, 0, 2); + brw_clip_copy_flatshaded_attributes(c, 1, 2); + } + } + brw_ENDIF(p); +} + + +/** + * Loads the clip distance for a vertex into `dst`, and ends with + * a comparison of it to zero with the condition `cond`. 
+ * + * - If using a fixed plane, the distance is dot(hpos, plane). + * - If using a user clip plane, the distance is directly available in the vertex. + */ +static inline void +load_clip_distance(struct brw_clip_compile *c, struct brw_indirect vtx, + struct brw_reg dst, GLuint hpos_offset, int cond) +{ + struct brw_codegen *p = &c->func; + + dst = vec4(dst); + brw_AND(p, vec1(brw_null_reg()), c->reg.vertex_src_mask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + struct brw_indirect temp_ptr = brw_indirect(7, 0); + brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx), c->reg.clipdistance_offset); + brw_MOV(p, vec1(dst), deref_1f(temp_ptr, 0)); + } + brw_ELSE(p); + { + brw_MOV(p, dst, deref_4f(vtx, hpos_offset)); + brw_DP4(p, dst, dst, c->reg.plane_equation); + } + brw_ENDIF(p); + + brw_CMP(p, brw_null_reg(), cond, vec1(dst), brw_imm_f(0.0f)); +} + + +/* Use mesa's clipping algorithms, translated to GFX4 assembly. + */ +void brw_clip_tri( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_indirect vtx = brw_indirect(0, 0); + struct brw_indirect vtxPrev = brw_indirect(1, 0); + struct brw_indirect vtxOut = brw_indirect(2, 0); + struct brw_indirect plane_ptr = brw_indirect(3, 0); + struct brw_indirect inlist_ptr = brw_indirect(4, 0); + struct brw_indirect outlist_ptr = brw_indirect(5, 0); + struct brw_indirect freelist_ptr = brw_indirect(6, 0); + GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + GLint clipdist0_offset = c->key.nr_userclip + ? 
brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0) + : 0; + + brw_MOV(p, get_addr_reg(vtxPrev), brw_address(c->reg.vertex[2]) ); + brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c)); + brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); + + brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) ); + + /* Set the initial vertex source mask: The first 6 planes are the bounds + * of the view volume; the next 8 planes are the user clipping planes. + */ + brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0)); + + /* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0]. + * We'll increment 6 times before we start hitting actual user clipping. */ + brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float))); + + brw_DO(p, BRW_EXECUTE_1); + { + /* if (planemask & 1) + */ + brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + + brw_IF(p, BRW_EXECUTE_1); + { + /* vtxOut = freelist_ptr++ + */ + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(freelist_ptr) ); + brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE)); + + if (c->key.nr_userclip) + brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); + else + brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0)); + + brw_DO(p, BRW_EXECUTE_1); + { + /* vtx = *input_ptr; + */ + brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0)); + + load_clip_distance(c, vtxPrev, c->reg.dpPrev, hpos_offset, BRW_CONDITIONAL_L); + /* (prev < 0.0f) */ + brw_IF(p, BRW_EXECUTE_1); + { + load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_GE); + /* IS_POSITIVE(next) + */ + brw_IF(p, BRW_EXECUTE_1); + { + + /* Coming back in. 
+ */ + brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev); + + /* If (vtxOut == 0) vtxOut = vtxPrev + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + BRW_PREDICATE_NORMAL); + + brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, false); + + /* *outlist_ptr++ = vtxOut; + * nr_verts++; + * vtxOut = 0; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); + } + brw_ENDIF(p); + + } + brw_ELSE(p); + { + /* *outlist_ptr++ = vtxPrev; + * nr_verts++; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + + load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_L); + /* (next < 0.0f) + */ + brw_IF(p, BRW_EXECUTE_1); + { + /* Going out of bounds. Avoid division by zero as we + * know dp != dpPrev from DIFFERENT_SIGNS, above. 
+ */ + brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp); + + /* If (vtxOut == 0) vtxOut = vtx + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + BRW_PREDICATE_NORMAL); + + brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, true); + + /* *outlist_ptr++ = vtxOut; + * nr_verts++; + * vtxOut = 0; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); + } + brw_ENDIF(p); + } + brw_ENDIF(p); + + /* vtxPrev = vtx; + * inlist_ptr++; + */ + brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx)); + brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short))); + + /* while (--loopcount != 0) + */ + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + /* vtxPrev = *(outlist_ptr-1) OR: outlist[nr_verts-1] + * inlist = outlist + * inlist_ptr = &inlist[0] + * outlist_ptr = &outlist[0] + */ + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2)); + brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0)); + brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0)); + brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); + } + brw_ENDIF(p); + + /* plane_ptr++; + */ + brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + + /* 
nr_verts >= 3 + */ + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + c->reg.nr_verts, + brw_imm_ud(3)); + brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL); + + /* && (planemask>>=1) != 0 + */ + brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1)); + brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float))); + } + brw_WHILE(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); +} + + + +void brw_clip_tri_emit_polygon(struct brw_clip_compile *c) +{ + struct brw_codegen *p = &c->func; + + /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--) + */ + brw_ADD(p, + c->reg.loopcount, + c->reg.nr_verts, + brw_imm_d(-2)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G); + + brw_IF(p, BRW_EXECUTE_1); + { + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect vptr = brw_indirect(1, 0); + + brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + ((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START)); + + brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_DO(p, BRW_EXECUTE_1); + { + brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT)); + + brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + brw_clip_emit_vue(c, v0, 
BRW_URB_WRITE_EOT_COMPLETE, + ((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + } + brw_ENDIF(p); +} + +static void do_clip_tri( struct brw_clip_compile *c ) +{ + brw_clip_init_planes(c); + + brw_clip_tri(c); +} + + +static void maybe_do_clip_tri( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); + brw_IF(p, BRW_EXECUTE_1); + { + do_clip_tri(c); + } + brw_ENDIF(p); +} + +static void brw_clip_test( struct brw_clip_compile *c ) +{ + struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + + struct brw_reg v0 = get_tmp(c); + struct brw_reg v1 = get_tmp(c); + struct brw_reg v2 = get_tmp(c); + + struct brw_indirect vt0 = brw_indirect(0, 0); + struct brw_indirect vt1 = brw_indirect(1, 0); + struct brw_indirect vt2 = brw_indirect(2, 0); + + struct brw_codegen *p = &c->func; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + + GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_POS); + + brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0])); + brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1])); + brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2])); + brw_MOV(p, v0, deref_4f(vt0, hpos_offset)); + brw_MOV(p, v1, deref_4f(vt1, hpos_offset)); + brw_MOV(p, v2, deref_4f(vt2, hpos_offset)); + brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f)); + + /* test nearz, xmin, ymin plane */ + /* clip.xyz < -clip.w */ + brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3))); + brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3))); + brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3))); + + /* All vertices are outside of a plane, rejected */ + brw_AND(p, 
t, t1, t2); + brw_AND(p, t, t, t3); + brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1)); + brw_OR(p, tmp0, tmp0, get_element(t, 2)); + brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + + /* some vertices are inside a plane, some are outside,need to clip */ + brw_XOR(p, t, t1, t2); + brw_XOR(p, t1, t2, t3); + brw_OR(p, t, t, t1); + brw_AND(p, t, t, brw_imm_ud(0x1)); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 0), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 1), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 2), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + /* test farz, xmax, ymax plane */ + /* clip.xyz > clip.w */ + brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3)); + brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3)); + brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3)); + + /* All vertices are outside of a plane, rejected */ + brw_AND(p, t, t1, t2); + brw_AND(p, t, t, t3); + brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1)); + brw_OR(p, tmp0, tmp0, get_element(t, 2)); + brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); + brw_set_default_predicate_control(p, 
BRW_PREDICATE_NONE); + + /* some vertices are inside a plane, some are outside,need to clip */ + brw_XOR(p, t, t1, t2); + brw_XOR(p, t1, t2, t3); + brw_OR(p, t, t, t1); + brw_AND(p, t, t, brw_imm_ud(0x1)); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 0), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 1), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 2), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + release_tmps(c); +} + + +void brw_emit_tri_clip( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); + brw_clip_tri_init_vertices(c); + brw_clip_init_clipmask(c); + brw_clip_init_ff_sync(c); + + /* if -ve rhw workaround bit is set, + do cliptest */ + if (p->devinfo->has_negative_rhw_bug) { + brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), + brw_imm_ud(1<<20)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_test(c); + } + brw_ENDIF(p); + } + /* Can't push into do_clip_tri because with polygon (or quad) + * flatshading, need to apply the flatshade here because we don't + * respect the PV when converting to trifan for emit: + */ + if (c->key.contains_flat_varying) + brw_clip_tri_flat_shade(c); + + if ((c->key.clip_mode == BRW_CLIP_MODE_NORMAL) || + (c->key.clip_mode == BRW_CLIP_MODE_KERNEL_CLIP)) + do_clip_tri(c); + else + maybe_do_clip_tri(c); + + brw_clip_tri_emit_polygon(c); + + /* Send an empty message to kill the 
thread: + */ + brw_clip_kill_thread(c); +} diff --git a/src/intel/compiler/elk/brw_clip_unfilled.c b/src/intel/compiler/elk/brw_clip_unfilled.c new file mode 100644 index 00000000000..c0e78acc7e8 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_unfilled.c @@ -0,0 +1,528 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" +#include "brw_prim.h" + + +/* This is performed against the original triangles, so no indirection + * required: +BZZZT! 
+ */ +static void compute_tri_direction( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg e = c->reg.tmp0; + struct brw_reg f = c->reg.tmp1; + GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + struct brw_reg v0 = byte_offset(c->reg.vertex[0], hpos_offset); + struct brw_reg v1 = byte_offset(c->reg.vertex[1], hpos_offset); + struct brw_reg v2 = byte_offset(c->reg.vertex[2], hpos_offset); + + + struct brw_reg v0n = get_tmp(c); + struct brw_reg v1n = get_tmp(c); + struct brw_reg v2n = get_tmp(c); + + /* Convert to NDC. + * NOTE: We can't modify the original vertex coordinates, + * as it may impact further operations. + * So, we have to keep normalized coordinates in temp registers. + * + * TBD-KC + * Try to optimize unnecessary MOV's. + */ + brw_MOV(p, v0n, v0); + brw_MOV(p, v1n, v1); + brw_MOV(p, v2n, v2); + + brw_clip_project_position(c, v0n); + brw_clip_project_position(c, v1n); + brw_clip_project_position(c, v2n); + + /* Calculate the vectors of two edges of the triangle: + */ + brw_ADD(p, e, v0n, negate(v2n)); + brw_ADD(p, f, v1n, negate(v2n)); + + /* Take their crossproduct: + */ + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, BRW_SWIZZLE_YZXW), + brw_swizzle(f, BRW_SWIZZLE_ZXYW)); + brw_MAC(p, vec4(e), negate(brw_swizzle(e, BRW_SWIZZLE_ZXYW)), + brw_swizzle(f, BRW_SWIZZLE_YZXW)); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e)); +} + + +static void cull_direction( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint conditional; + + assert (!(c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL && + c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL)); + + if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL) + conditional = BRW_CONDITIONAL_GE; + else + conditional = BRW_CONDITIONAL_L; + + brw_CMP(p, + vec1(brw_null_reg()), + conditional, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + brw_IF(p, 
BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); +} + + + +static void copy_bfc( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint conditional; + + /* Do we have any colors to copy? + */ + if (!(brw_clip_have_varying(c, VARYING_SLOT_COL0) && + brw_clip_have_varying(c, VARYING_SLOT_BFC0)) && + !(brw_clip_have_varying(c, VARYING_SLOT_COL1) && + brw_clip_have_varying(c, VARYING_SLOT_BFC1))) + return; + + /* In some weird degenerate cases we can end up testing the + * direction twice, once for culling and once for bfc copying. Oh + * well, that's what you get for setting weird GL state. + */ + if (c->key.copy_bfc_ccw) + conditional = BRW_CONDITIONAL_GE; + else + conditional = BRW_CONDITIONAL_L; + + brw_CMP(p, + vec1(brw_null_reg()), + conditional, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + brw_IF(p, BRW_EXECUTE_1); + { + GLuint i; + + for (i = 0; i < 3; i++) { + if (brw_clip_have_varying(c, VARYING_SLOT_COL0) && + brw_clip_have_varying(c, VARYING_SLOT_BFC0)) + brw_MOV(p, + byte_offset(c->reg.vertex[i], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_COL0)), + byte_offset(c->reg.vertex[i], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_BFC0))); + + if (brw_clip_have_varying(c, VARYING_SLOT_COL1) && + brw_clip_have_varying(c, VARYING_SLOT_BFC1)) + brw_MOV(p, + byte_offset(c->reg.vertex[i], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_COL1)), + byte_offset(c->reg.vertex[i], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_BFC1))); + } + } + brw_ENDIF(p); +} + + + + +/* + GLfloat iz = 1.0 / dir.z; + GLfloat ac = dir.x * iz; + GLfloat bc = dir.y * iz; + offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE; + offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor; + if (ctx->Polygon.OffsetClamp && isfinite(ctx->Polygon.OffsetClamp)) { + if (ctx->Polygon.OffsetClamp < 0) + offset = MAX2( offset, ctx->Polygon.OffsetClamp ); + else + offset = MIN2( offset, ctx->Polygon.OffsetClamp ); + } + offset *= MRD; 
+*/ +static void compute_offset( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg off = c->reg.offset; + struct brw_reg dir = c->reg.dir; + + brw_math_invert(p, get_element(off, 2), get_element(dir, 2)); + brw_MUL(p, vec2(off), vec2(dir), get_element(off, 2)); + + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + brw_abs(get_element(off, 0)), + brw_abs(get_element(off, 1))); + + brw_SEL(p, vec1(off), + brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1))); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + brw_MUL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_factor)); + brw_ADD(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_units)); + if (c->key.offset_clamp && isfinite(c->key.offset_clamp)) { + brw_CMP(p, + vec1(brw_null_reg()), + c->key.offset_clamp < 0 ? BRW_CONDITIONAL_GE : BRW_CONDITIONAL_L, + vec1(off), + brw_imm_f(c->key.offset_clamp)); + brw_SEL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_clamp)); + } +} + + +static void merge_edgeflags( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0); + + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_POLYGON)); + + /* Get away with using reg.vertex because we know that this is not + * a _3DPRIM_TRISTRIP_REVERSE: + */ + brw_IF(p, BRW_EXECUTE_1); + { + brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_EQ); + brw_MOV(p, byte_offset(c->reg.vertex[0], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_EDGE)), + brw_imm_f(0)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + + brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, 
BRW_CONDITIONAL_EQ); + brw_MOV(p, byte_offset(c->reg.vertex[2], + brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_EDGE)), + brw_imm_f(0)); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + } + brw_ENDIF(p); +} + + + +static void apply_one_offset( struct brw_clip_compile *c, + struct brw_indirect vert ) +{ + struct brw_codegen *p = &c->func; + GLuint ndc_offset = brw_varying_to_offset(&c->vue_map, + BRW_VARYING_SLOT_NDC); + struct brw_reg z = deref_1f(vert, ndc_offset + + 2 * type_sz(BRW_REGISTER_TYPE_F)); + + brw_ADD(p, z, z, vec1(c->reg.offset)); +} + + + +/*********************************************************************** + * Output clipped polygon as an unfilled primitive: + */ +static void emit_lines(struct brw_clip_compile *c, + bool do_offset) +{ + struct brw_codegen *p = &c->func; + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect v1 = brw_indirect(1, 0); + struct brw_indirect v0ptr = brw_indirect(2, 0); + struct brw_indirect v1ptr = brw_indirect(3, 0); + + /* Need a separate loop for offset: + */ + if (do_offset) { + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + + brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + apply_one_offset(c, v0); + + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); + } + + /* v1ptr = &inlist[nr_verts] + * *v1ptr = v0 + */ + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW)); + brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, 
BRW_REGISTER_TYPE_UW)); + brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0)); + + brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + /* draw edge if edgeflag != 0 */ + brw_CMP(p, + vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, + deref_1f(v0, brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_EDGE)), + brw_imm_f(0)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START); + brw_clip_emit_vue(c, v1, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END); + } + brw_ENDIF(p); + + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); +} + + + +static void emit_points(struct brw_clip_compile *c, + bool do_offset ) +{ + struct brw_codegen *p = &c->func; + + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect v0ptr = brw_indirect(2, 0); + + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + + brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + /* draw if edgeflag != 0 + */ + brw_CMP(p, + vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, + deref_1f(v0, brw_varying_to_offset(&c->vue_map, + VARYING_SLOT_EDGE)), + brw_imm_f(0)); + brw_IF(p, BRW_EXECUTE_1); + { + if (do_offset) + apply_one_offset(c, v0); + + brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE, + (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START | URB_WRITE_PRIM_END); + } + brw_ENDIF(p); + + brw_ADD(p, 
c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + } + brw_WHILE(p); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL); +} + + + + + + + +static void emit_primitives( struct brw_clip_compile *c, + GLuint mode, + bool do_offset ) +{ + switch (mode) { + case BRW_CLIP_FILL_MODE_FILL: + brw_clip_tri_emit_polygon(c); + break; + + case BRW_CLIP_FILL_MODE_LINE: + emit_lines(c, do_offset); + break; + + case BRW_CLIP_FILL_MODE_POINT: + emit_points(c, do_offset); + break; + + case BRW_CLIP_FILL_MODE_CULL: + unreachable("not reached"); + } +} + + + +static void emit_unfilled_primitives( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + /* Direction culling has already been done. + */ + if (c->key.fill_ccw != c->key.fill_cw && + c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL && + c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL) + { + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + brw_IF(p, BRW_EXECUTE_1); + { + emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); + } + brw_ELSE(p); + { + emit_primitives(c, c->key.fill_cw, c->key.offset_cw); + } + brw_ENDIF(p); + } + else if (c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL) { + emit_primitives(c, c->key.fill_cw, c->key.offset_cw); + } + else if (c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL) { + emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); + } +} + + + + +static void check_nr_verts( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p); +} + + +void brw_emit_unfilled_clip( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) || + (c->key.fill_ccw != c->key.fill_cw) || + c->key.fill_ccw == 
BRW_CLIP_FILL_MODE_CULL || + c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL || + c->key.copy_bfc_cw || + c->key.copy_bfc_ccw); + + brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); + brw_clip_tri_init_vertices(c); + brw_clip_init_ff_sync(c); + + assert(brw_clip_have_varying(c, VARYING_SLOT_EDGE)); + + if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL && + c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL) { + brw_clip_kill_thread(c); + return; + } + + merge_edgeflags(c); + + /* Need to use the inlist indirection here: + */ + if (c->need_direction) + compute_tri_direction(c); + + if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL || + c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL) + cull_direction(c); + + if (c->key.offset_ccw || + c->key.offset_cw) + compute_offset(c); + + if (c->key.copy_bfc_ccw || + c->key.copy_bfc_cw) + copy_bfc(c); + + /* Need to do this whether we clip or not: + */ + if (c->key.contains_flat_varying) + brw_clip_tri_flat_shade(c); + + brw_clip_init_clipmask(c); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); + brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_init_planes(c); + brw_clip_tri(c); + check_nr_verts(c); + } + brw_ENDIF(p); + + emit_unfilled_primitives(c); + brw_clip_kill_thread(c); +} diff --git a/src/intel/compiler/elk/brw_clip_util.c b/src/intel/compiler/elk/brw_clip_util.c new file mode 100644 index 00000000000..270a6dc3225 --- /dev/null +++ b/src/intel/compiler/elk/brw_clip_util.c @@ -0,0 +1,464 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_clip.h" + + +struct brw_reg get_tmp( struct brw_clip_compile *c ) +{ + struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0); + + if (++c->last_tmp > c->prog_data.total_grf) + c->prog_data.total_grf = c->last_tmp; + + return tmp; +} + +static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp ) +{ + if (tmp.nr == c->last_tmp-1) + c->last_tmp--; +} + + +static struct brw_reg make_plane_ud(GLuint x, GLuint y, GLuint z, GLuint w) +{ + return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x); +} + + +void brw_clip_init_planes( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + + if (!c->key.nr_userclip) { + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0, 0, 0xff, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0, 0, 1, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0, 1, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff, 0, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1, 0, 0, 1)); + } +} + + + +#define W 3 + +/* Project 'pos' to screen space (or back again), overwrite with results: + */ +void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos ) +{ + struct brw_codegen *p = &c->func; + + /* calc rhw + */ + brw_math_invert(p, get_element(pos, W), get_element(pos, W)); + + /* value.xyz *= value.rhw + */ + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, + brw_swizzle(pos, BRW_SWIZZLE_WWWW)); + brw_set_default_access_mode(p, BRW_ALIGN_1); +} + + +static void brw_clip_project_vertex( struct brw_clip_compile *c, + struct brw_indirect vert_addr ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg tmp = get_tmp(c); + GLuint 
hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + GLuint ndc_offset = brw_varying_to_offset(&c->vue_map, + BRW_VARYING_SLOT_NDC); + + /* Fixup position. Extract from the original vertex and re-project + * to screen space: + */ + brw_MOV(p, tmp, deref_4f(vert_addr, hpos_offset)); + brw_clip_project_position(c, tmp); + brw_MOV(p, deref_4f(vert_addr, ndc_offset), tmp); + + release_tmp(c, tmp); +} + + + + +/* Interpolate between two vertices and put the result into a0.0. + * Increment a0.0 accordingly. + * + * Beware that dest_ptr can be equal to v0_ptr! + */ +void brw_clip_interp_vertex( struct brw_clip_compile *c, + struct brw_indirect dest_ptr, + struct brw_indirect v0_ptr, /* from */ + struct brw_indirect v1_ptr, /* to */ + struct brw_reg t0, + bool force_edgeflag) +{ + struct brw_codegen *p = &c->func; + struct brw_reg t_nopersp, v0_ndc_copy; + GLuint slot; + + /* Just copy the vertex header: + */ + /* + * After CLIP stage, only first 256 bits of the VUE are read + * back on Ironlake, so needn't change it + */ + brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1); + + + /* First handle the 3D and NDC interpolation, in case we + * need noperspective interpolation. Doing it early has no + * performance impact in any case. + */ + + /* Take a copy of the v0 NDC coordinates, in case dest == v0. 
*/ + if (c->key.contains_noperspective_varying) { + GLuint offset = brw_varying_to_offset(&c->vue_map, + BRW_VARYING_SLOT_NDC); + v0_ndc_copy = get_tmp(c); + brw_MOV(p, v0_ndc_copy, deref_4f(v0_ptr, offset)); + } + + /* Compute the new 3D position + * + * dest_hpos = v0_hpos * (1 - t0) + v1_hpos * t0 + */ + { + GLuint delta = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS); + struct brw_reg tmp = get_tmp(c); + brw_MUL(p, vec4(brw_null_reg()), deref_4f(v1_ptr, delta), t0); + brw_MAC(p, tmp, negate(deref_4f(v0_ptr, delta)), t0); + brw_ADD(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta), tmp); + release_tmp(c, tmp); + } + + /* Recreate the projected (NDC) coordinate in the new vertex header */ + brw_clip_project_vertex(c, dest_ptr); + + /* If we have noperspective attributes, + * we need to compute the screen-space t + */ + if (c->key.contains_noperspective_varying) { + GLuint delta = brw_varying_to_offset(&c->vue_map, + BRW_VARYING_SLOT_NDC); + struct brw_reg tmp = get_tmp(c); + t_nopersp = get_tmp(c); + + /* t_nopersp = vec4(v1.xy, dest.xy) */ + brw_MOV(p, t_nopersp, deref_4f(v1_ptr, delta)); + brw_MOV(p, tmp, deref_4f(dest_ptr, delta)); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MOV(p, + brw_writemask(t_nopersp, WRITEMASK_ZW), + brw_swizzle(tmp, BRW_SWIZZLE_XYXY)); + + /* t_nopersp = vec4(v1.xy, dest.xy) - v0.xyxy */ + brw_ADD(p, t_nopersp, t_nopersp, + negate(brw_swizzle(v0_ndc_copy, BRW_SWIZZLE_XYXY))); + + /* Add the absolute values of the X and Y deltas so that if + * the points aren't in the same place on the screen we get + * nonzero values to divide. 
+ * + * After that, we have vert1 - vert0 in t_nopersp.x and + * vertnew - vert0 in t_nopersp.y + * + * t_nopersp = vec2(|v1.x -v0.x| + |v1.y -v0.y|, + * |dest.x-v0.x| + |dest.y-v0.y|) + */ + brw_ADD(p, + brw_writemask(t_nopersp, WRITEMASK_XY), + brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_XZXZ)), + brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_YWYW))); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + /* If the points are in the same place, just substitute a + * value to avoid divide-by-zero + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, + vec1(t_nopersp), + brw_imm_f(0)); + brw_IF(p, BRW_EXECUTE_1); + brw_MOV(p, t_nopersp, brw_imm_vf4(brw_float_to_vf(1.0), + brw_float_to_vf(0.0), + brw_float_to_vf(0.0), + brw_float_to_vf(0.0))); + brw_ENDIF(p); + + /* Now compute t_nopersp = t_nopersp.y/t_nopersp.x and broadcast it. */ + brw_math_invert(p, get_element(t_nopersp, 0), get_element(t_nopersp, 0)); + brw_MUL(p, vec1(t_nopersp), vec1(t_nopersp), + vec1(suboffset(t_nopersp, 1))); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, BRW_SWIZZLE_XXXX)); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + release_tmp(c, tmp); + release_tmp(c, v0_ndc_copy); + } + + /* Now we can iterate over each attribute + * (could be done in pairs?) + */ + for (slot = 0; slot < c->vue_map.num_slots; slot++) { + int varying = c->vue_map.slot_to_varying[slot]; + GLuint delta = brw_vue_slot_to_offset(slot); + + /* HPOS, NDC already handled above */ + if (varying == VARYING_SLOT_POS || varying == BRW_VARYING_SLOT_NDC) + continue; + + + if (varying == VARYING_SLOT_EDGE) { + if (force_edgeflag) + brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1)); + else + brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta)); + } else if (varying == VARYING_SLOT_PSIZ) { + /* PSIZ doesn't need interpolation because it isn't used by the + * fragment shader. 
+ */ + } else if (varying < VARYING_SLOT_MAX) { + /* This is a true vertex result (and not a special value for the VUE + * header), so interpolate: + * + * New = attr0 + t*attr1 - t*attr0 + * + * Unless the attribute is flat shaded -- in which case just copy + * from one of the sources (doesn't matter which; already copied from pv) + */ + GLuint interp = c->key.interp_mode[slot]; + + if (interp != INTERP_MODE_FLAT) { + struct brw_reg tmp = get_tmp(c); + struct brw_reg t = + interp == INTERP_MODE_NOPERSPECTIVE ? t_nopersp : t0; + + brw_MUL(p, + vec4(brw_null_reg()), + deref_4f(v1_ptr, delta), + t); + + brw_MAC(p, + tmp, + negate(deref_4f(v0_ptr, delta)), + t); + + brw_ADD(p, + deref_4f(dest_ptr, delta), + deref_4f(v0_ptr, delta), + tmp); + + release_tmp(c, tmp); + } + else { + brw_MOV(p, + deref_4f(dest_ptr, delta), + deref_4f(v0_ptr, delta)); + } + } + } + + if (c->vue_map.num_slots % 2) { + GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots); + + brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0)); + } + + if (c->key.contains_noperspective_varying) + release_tmp(c, t_nopersp); +} + +void brw_clip_emit_vue(struct brw_clip_compile *c, + struct brw_indirect vert, + enum brw_urb_write_flags flags, + GLuint header) +{ + struct brw_codegen *p = &c->func; + bool allocate = flags & BRW_URB_WRITE_ALLOCATE; + + brw_clip_ff_sync(c); + + /* Any URB entry that is allocated must subsequently be used or discarded, + * so it doesn't make sense to mark EOT and ALLOCATE at the same time. + */ + assert(!(allocate && (flags & BRW_URB_WRITE_EOT))); + + /* Copy the vertex from vertn into m1..mN+1: + */ + brw_copy_from_indirect(p, brw_message_reg(1), vert, c->nr_regs); + + /* Overwrite PrimType and PrimStart in the message header, for + * each vertex in turn: + */ + brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header)); + + + /* Send each vertex as a separate write to the urb. 
This + * is different to the concept in brw_sf_emit.c, where + * subsequent writes are used to build up a single urb + * entry. Each of these writes instantiates a separate + * urb entry - (I think... what about 'allocate'?) + */ + brw_urb_WRITE(p, + allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + 0, + c->reg.R0, + flags, + c->nr_regs + 1, /* msg length */ + allocate ? 1 : 0, /* response_length */ + 0, /* urb offset */ + BRW_URB_SWIZZLE_NONE); +} + + + +void brw_clip_kill_thread(struct brw_clip_compile *c) +{ + struct brw_codegen *p = &c->func; + + brw_clip_ff_sync(c); + /* Send an empty message to kill the thread and release any + * allocated urb entry: + */ + brw_urb_WRITE(p, + retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + 0, + c->reg.R0, + BRW_URB_WRITE_UNUSED | BRW_URB_WRITE_EOT_COMPLETE, + 1, /* msg len */ + 0, /* response len */ + 0, + BRW_URB_SWIZZLE_NONE); +} + + + + +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ) +{ + return brw_address(c->reg.fixed_planes); +} + + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ) +{ + if (c->key.nr_userclip) { + return brw_imm_uw(16); + } + else { + return brw_imm_uw(4); + } +} + + +/* Distribute flatshaded attributes from provoking vertex prior to + * clipping. 
+ */ +void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c, + GLuint to, GLuint from ) +{ + struct brw_codegen *p = &c->func; + + for (int i = 0; i < c->vue_map.num_slots; i++) { + if (c->key.interp_mode[i] == INTERP_MODE_FLAT) { + brw_MOV(p, + byte_offset(c->reg.vertex[to], brw_vue_slot_to_offset(i)), + byte_offset(c->reg.vertex[from], brw_vue_slot_to_offset(i))); + } + } +} + + + +void brw_clip_init_clipmask( struct brw_clip_compile *c ) +{ + struct brw_codegen *p = &c->func; + struct brw_reg incoming = get_element_ud(c->reg.R0, 2); + + /* Shift so that lowest outcode bit is rightmost: + */ + brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26)); + + if (c->key.nr_userclip) { + struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD); + + /* Rearrange userclip outcodes so that they come directly after + * the fixed plane bits. + */ + if (p->devinfo->ver == 5 || p->devinfo->verx10 == 45) + brw_AND(p, tmp, incoming, brw_imm_ud(0xff<<14)); + else + brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14)); + + brw_SHR(p, tmp, tmp, brw_imm_ud(8)); + brw_OR(p, c->reg.planemask, c->reg.planemask, tmp); + + release_tmp(c, tmp); + } +} + +void brw_clip_ff_sync(struct brw_clip_compile *c) +{ + struct brw_codegen *p = &c->func; + + if (p->devinfo->ver == 5) { + brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z); + brw_IF(p, BRW_EXECUTE_1); + { + brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1)); + brw_ff_sync(p, + c->reg.R0, + 0, + c->reg.R0, + 1, /* allocate */ + 1, /* response length */ + 0 /* eot */); + } + brw_ENDIF(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + } +} + +void brw_clip_init_ff_sync(struct brw_clip_compile *c) +{ + struct brw_codegen *p = &c->func; + + if (p->devinfo->ver == 5) { + brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0)); + } +} diff --git a/src/intel/compiler/elk/brw_compile_clip.c 
b/src/intel/compiler/elk/brw_compile_clip.c new file mode 100644 index 00000000000..25f476d4066 --- /dev/null +++ b/src/intel/compiler/elk/brw_compile_clip.c @@ -0,0 +1,97 @@ +/* + * Copyright © 2006 - 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_clip.h" +#include "brw_disasm.h" + +#include "dev/intel_debug.h" + +const unsigned * +brw_compile_clip(const struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_clip_prog_key *key, + struct brw_clip_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size) +{ + struct brw_clip_compile c; + memset(&c, 0, sizeof(c)); + + /* Begin the compilation: + */ + brw_init_codegen(&compiler->isa, &c.func, mem_ctx); + + c.func.single_program_flow = 1; + + c.key = *key; + c.vue_map = *vue_map; + + /* nr_regs is the number of registers filled by reading data from the VUE. 
+ * This program accesses the entire VUE, so nr_regs needs to be the size of + * the VUE (measured in pairs, since two slots are stored in each + * register). + */ + c.nr_regs = (c.vue_map.num_slots + 1)/2; + + c.prog_data.clip_mode = c.key.clip_mode; /* XXX */ + + /* For some reason the thread is spawned with only 4 channels + * unmasked. + */ + brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE); + + /* Would ideally have the option of producing a program which could + * do all three: + */ + switch (key->primitive) { + case MESA_PRIM_TRIANGLES: + if (key->do_unfilled) + brw_emit_unfilled_clip( &c ); + else + brw_emit_tri_clip( &c ); + break; + case MESA_PRIM_LINES: + brw_emit_line_clip( &c ); + break; + case MESA_PRIM_POINTS: + brw_emit_point_clip( &c ); + break; + default: + unreachable("not reached"); + } + + brw_compact_instructions(&c.func, 0, NULL); + + *prog_data = c.prog_data; + + const unsigned *program = brw_get_program(&c.func, final_assembly_size); + + if (INTEL_DEBUG(DEBUG_CLIP)) { + fprintf(stderr, "clip:\n"); + brw_disassemble_with_labels(&compiler->isa, + program, 0, *final_assembly_size, stderr); + fprintf(stderr, "\n"); + } + + return program; +} diff --git a/src/intel/compiler/elk/brw_compile_ff_gs.c b/src/intel/compiler/elk/brw_compile_ff_gs.c new file mode 100644 index 00000000000..200a1dd0415 --- /dev/null +++ b/src/intel/compiler/elk/brw_compile_ff_gs.c @@ -0,0 +1,662 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_compiler.h" +#include "brw_disasm.h" +#include "brw_eu.h" +#include "brw_prim.h" + +#include "dev/intel_debug.h" + +#define MAX_GS_VERTS (4) + +struct brw_ff_gs_compile { + struct brw_codegen func; + struct brw_ff_gs_prog_key key; + struct brw_ff_gs_prog_data *prog_data; + + struct { + struct brw_reg R0; + + /** + * Register holding streamed vertex buffer pointers -- see the Sandy + * Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload + * [DevSNB]). These pointers are delivered in GRF 1. + */ + struct brw_reg SVBI; + + struct brw_reg vertex[MAX_GS_VERTS]; + struct brw_reg header; + struct brw_reg temp; + + /** + * Register holding destination indices for streamed buffer writes. + * Only used for SOL programs. 
+ */ + struct brw_reg destination_indices; + } reg; + + /* Number of registers used to store vertex data */ + GLuint nr_regs; + + struct intel_vue_map vue_map; +}; + +/** + * Allocate registers for GS. + * + * If sol_program is true, then: + * + * - The thread will be spawned with the "SVBI Payload Enable" bit set, so GRF + * 1 needs to be set aside to hold the streamed vertex buffer indices. + * + * - The thread will need to use the destination_indices register. + */ +static void brw_ff_gs_alloc_regs(struct brw_ff_gs_compile *c, + GLuint nr_verts, + bool sol_program) +{ + GLuint i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + /* Streamed vertex buffer indices */ + if (sol_program) + c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD); + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < nr_verts; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + c->reg.header = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD); + c->reg.temp = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD); + + if (sol_program) { + c->reg.destination_indices = + retype(brw_vec4_grf(i++, 0), BRW_REGISTER_TYPE_UD); + } + + c->prog_data->urb_read_length = c->nr_regs; + c->prog_data->total_grf = i; +} + + +/** + * Set up the initial value of c->reg.header register based on c->reg.R0. + * + * The following information is passed to the GS thread in R0, and needs to be + * included in the first URB_WRITE or FF_SYNC message sent by the GS: + * + * - DWORD 0 [31:0] handle info (Gen4 only) + * - DWORD 5 [7:0] FFTID + * - DWORD 6 [31:0] Debug info + * - DWORD 7 [31:0] Debug info + * + * This function sets up the above data by copying by copying the contents of + * R0 to the header register. 
+ */ +static void brw_ff_gs_initialize_header(struct brw_ff_gs_compile *c) +{ + struct brw_codegen *p = &c->func; + brw_MOV(p, c->reg.header, c->reg.R0); +} + +/** + * Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value. + * + * In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart, + * PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we + * need to be able to update on a per-vertex basis. + */ +static void brw_ff_gs_overwrite_header_dw2(struct brw_ff_gs_compile *c, + unsigned dw2) +{ + struct brw_codegen *p = &c->func; + brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2)); +} + +/** + * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0. + * + * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0 + * of DWORD 2. URB_WRITE messages need the primitive type in bits 6:2 of + * DWORD 2. So this function extracts the primitive type field, bitshifts it + * appropriately, and stores it in c->reg.header. + */ +static void brw_ff_gs_overwrite_header_dw2_from_r0(struct brw_ff_gs_compile *c) +{ + struct brw_codegen *p = &c->func; + brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2), + brw_imm_ud(0x1f)); + brw_SHL(p, get_element_ud(c->reg.header, 2), + get_element_ud(c->reg.header, 2), brw_imm_ud(2)); +} + +/** + * Apply an additive offset to DWORD 2 of c->reg.header. + * + * This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately + * for each vertex. + */ +static void brw_ff_gs_offset_header_dw2(struct brw_ff_gs_compile *c, + int offset) +{ + struct brw_codegen *p = &c->func; + brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2), + brw_imm_d(offset)); +} + + +/** + * Emit a vertex using the URB_WRITE message. Use the contents of + * c->reg.header for the message header, and the registers starting at \c vert + * for the vertex data. 
+ * + * If \c last is true, then this is the last vertex, so no further URB space + * should be allocated, and this message should end the thread. + * + * If \c last is false, then a new URB entry will be allocated, and its handle + * will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE + * message. + */ +static void brw_ff_gs_emit_vue(struct brw_ff_gs_compile *c, + struct brw_reg vert, + bool last) +{ + struct brw_codegen *p = &c->func; + int write_offset = 0; + bool complete = false; + + do { + /* We can't write more than 14 registers at a time to the URB */ + int write_len = MIN2(c->nr_regs - write_offset, 14); + if (write_len == c->nr_regs - write_offset) + complete = true; + + /* Copy the vertex from vertn into m1..mN+1: + */ + brw_copy8(p, brw_message_reg(1), offset(vert, write_offset), write_len); + + /* Send the vertex data to the URB. If this is the last write for this + * vertex, then we mark it as complete, and either end the thread or + * allocate another vertex URB entry (depending whether this is the last + * vertex). + */ + enum brw_urb_write_flags flags; + if (!complete) + flags = BRW_URB_WRITE_NO_FLAGS; + else if (last) + flags = BRW_URB_WRITE_EOT_COMPLETE; + else + flags = BRW_URB_WRITE_ALLOCATE_COMPLETE; + brw_urb_WRITE(p, + (flags & BRW_URB_WRITE_ALLOCATE) ? c->reg.temp + : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + 0, + c->reg.header, + flags, + write_len + 1, /* msg length */ + (flags & BRW_URB_WRITE_ALLOCATE) ? 1 + : 0, /* response length */ + write_offset, /* urb offset */ + BRW_URB_SWIZZLE_NONE); + write_offset += write_len; + } while (!complete); + + if (!last) { + brw_MOV(p, get_element_ud(c->reg.header, 0), + get_element_ud(c->reg.temp, 0)); + } +} + +/** + * Send an FF_SYNC message to ensure that all previously spawned GS threads + * have finished sending primitives down the pipeline, and to allocate a URB + * entry for the first output vertex. Only needed on Ironlake+. 
+ * + * This function modifies c->reg.header: in DWORD 1, it stores num_prim (which + * is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to + * the allocated URB entry (which will be needed by the URB_WRITE meesage that + * follows). + */ +static void brw_ff_gs_ff_sync(struct brw_ff_gs_compile *c, int num_prim) +{ + struct brw_codegen *p = &c->func; + + brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim)); + brw_ff_sync(p, + c->reg.temp, + 0, + c->reg.header, + 1, /* allocate */ + 1, /* response length */ + 0 /* eot */); + brw_MOV(p, get_element_ud(c->reg.header, 0), + get_element_ud(c->reg.temp, 0)); +} + + +static void +brw_ff_gs_quads(struct brw_ff_gs_compile *c, + const struct brw_ff_gs_prog_key *key) +{ + brw_ff_gs_alloc_regs(c, 4, false); + brw_ff_gs_initialize_header(c); + /* Use polygons for correct edgeflag behaviour. Note that vertex 3 + * is the PV for quads, but vertex 0 for polygons: + */ + if (c->func.devinfo->ver == 5) + brw_ff_gs_ff_sync(c, 1); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START)); + if (key->pv_first) { + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_overwrite_header_dw2( + c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0); + brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1); + } + else { + brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0); + brw_ff_gs_overwrite_header_dw2( + c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[2], 1); + } +} + +static void +brw_ff_gs_quad_strip(struct brw_ff_gs_compile 
*c, + const struct brw_ff_gs_prog_key *key) +{ + brw_ff_gs_alloc_regs(c, 4, false); + brw_ff_gs_initialize_header(c); + + if (c->func.devinfo->ver == 5) + brw_ff_gs_ff_sync(c, 1); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START)); + if (key->pv_first) { + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_overwrite_header_dw2( + c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0); + brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1); + } + else { + brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0); + brw_ff_gs_overwrite_header_dw2( + c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT); + brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1); + } +} + +static void brw_ff_gs_lines(struct brw_ff_gs_compile *c) +{ + brw_ff_gs_alloc_regs(c, 2, false); + brw_ff_gs_initialize_header(c); + + if (c->func.devinfo->ver == 5) + brw_ff_gs_ff_sync(c, 1); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_START)); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0); + brw_ff_gs_overwrite_header_dw2( + c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT) + | URB_WRITE_PRIM_END)); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1); +} + +/** + * Generate the geometry shader program used on Gen6 to perform stream output + * (transform feedback). 
+ */ +static void +gfx6_sol_program(struct brw_ff_gs_compile *c, const struct brw_ff_gs_prog_key *key, + unsigned num_verts, bool check_edge_flags) +{ + struct brw_codegen *p = &c->func; + brw_inst *inst; + c->prog_data->svbi_postincrement_value = num_verts; + + brw_ff_gs_alloc_regs(c, num_verts, true); + brw_ff_gs_initialize_header(c); + + if (key->num_transform_feedback_bindings > 0) { + unsigned vertex, binding; + struct brw_reg destination_indices_uw = + vec8(retype(c->reg.destination_indices, BRW_REGISTER_TYPE_UW)); + + /* Note: since we use the binding table to keep track of buffer offsets + * and stride, the GS doesn't need to keep track of a separate pointer + * into each buffer; it uses a single pointer which increments by 1 for + * each vertex. So we use SVBI0 for this pointer, regardless of whether + * transform feedback is in interleaved or separate attribs mode. + * + * Make sure that the buffers have enough room for all the vertices. + */ + brw_ADD(p, get_element_ud(c->reg.temp, 0), + get_element_ud(c->reg.SVBI, 0), brw_imm_ud(num_verts)); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, + get_element_ud(c->reg.temp, 0), + get_element_ud(c->reg.SVBI, 4)); + brw_IF(p, BRW_EXECUTE_1); + + /* Compute the destination indices to write to. Usually we use SVBI[0] + * + (0, 1, 2). However, for odd-numbered triangles in tristrips, the + * vertices come down the pipeline in reversed winding order, so we need + * to flip the order when writing to the transform feedback buffer. To + * ensure that flatshading accuracy is preserved, we need to write them + * in order SVBI[0] + (0, 2, 1) if we're using the first provoking + * vertex convention, and in order SVBI[0] + (1, 0, 2) if we're using + * the last provoking vertex convention. 
+ * + * Note: since brw_imm_v can only be used in instructions in + * packed-word execution mode, and SVBI is a double-word, we need to + * first move the appropriate immediate constant ((0, 1, 2), (0, 2, 1), + * or (1, 0, 2)) to the destination_indices register, and then add SVBI + * using a separate instruction. Also, since the immediate constant is + * expressed as packed words, and we need to load double-words into + * destination_indices, we need to intersperse zeros to fill the upper + * halves of each double-word. + */ + brw_MOV(p, destination_indices_uw, + brw_imm_v(0x00020100)); /* (0, 1, 2) */ + if (num_verts == 3) { + /* Get primitive type into temp register. */ + brw_AND(p, get_element_ud(c->reg.temp, 0), + get_element_ud(c->reg.R0, 2), brw_imm_ud(0x1f)); + + /* Test if primitive type is TRISTRIP_REVERSE. We need to do this as + * an 8-wide comparison so that the conditional MOV that follows + * moves all 8 words correctly. + */ + brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_EQ, + get_element_ud(c->reg.temp, 0), + brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE)); + + /* If so, then overwrite destination_indices_uw with the appropriate + * reordering. + */ + inst = brw_MOV(p, destination_indices_uw, + brw_imm_v(key->pv_first ? 0x00010200 /* (0, 2, 1) */ + : 0x00020001)); /* (1, 0, 2) */ + brw_inst_set_pred_control(p->devinfo, inst, BRW_PREDICATE_NORMAL); + } + + assert(c->reg.destination_indices.width == BRW_EXECUTE_4); + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + brw_ADD(p, c->reg.destination_indices, + c->reg.destination_indices, get_element_ud(c->reg.SVBI, 0)); + brw_pop_insn_state(p); + /* For each vertex, generate code to output each varying using the + * appropriate binding table entry. 
+ */ + for (vertex = 0; vertex < num_verts; ++vertex) { + /* Set up the correct destination index for this vertex */ + brw_MOV(p, get_element_ud(c->reg.header, 5), + get_element_ud(c->reg.destination_indices, vertex)); + + for (binding = 0; binding < key->num_transform_feedback_bindings; + ++binding) { + unsigned char varying = + key->transform_feedback_bindings[binding]; + unsigned char slot = c->vue_map.varying_to_slot[varying]; + /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1: + * + * "Prior to End of Thread with a URB_WRITE, the kernel must + * ensure that all writes are complete by sending the final + * write as a committed write." + */ + bool final_write = + binding == key->num_transform_feedback_bindings - 1 && + vertex == num_verts - 1; + struct brw_reg vertex_slot = c->reg.vertex[vertex]; + vertex_slot.nr += slot / 2; + vertex_slot.subnr = (slot % 2) * 16; + /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */ + vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ + ? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding]; + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + + brw_MOV(p, stride(c->reg.header, 4, 4, 1), + retype(vertex_slot, BRW_REGISTER_TYPE_UD)); + brw_pop_insn_state(p); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_svb_write(p, + final_write ? c->reg.temp : brw_null_reg(), /* dest */ + 1, /* msg_reg_nr */ + c->reg.header, /* src0 */ + BRW_GFX6_SOL_BINDING_START + binding, /* binding_table_index */ + final_write); /* send_commit_msg */ + } + } + brw_ENDIF(p); + + /* Now, reinitialize the header register from R0 to restore the parts of + * the register that we overwrote while streaming out transform feedback + * data. + */ + brw_ff_gs_initialize_header(c); + + /* Finally, wait for the write commit to occur so that we can proceed to + * other things safely. 
+ * + * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3: + * + * The write commit does not modify the destination register, but + * merely clears the dependency associated with the destination + * register. Thus, a simple “mov” instruction using the register as a + * source is sufficient to wait for the write commit to occur. + */ + brw_MOV(p, c->reg.temp, c->reg.temp); + } + + brw_ff_gs_ff_sync(c, 1); + + brw_ff_gs_overwrite_header_dw2_from_r0(c); + switch (num_verts) { + case 1: + brw_ff_gs_offset_header_dw2(c, + URB_WRITE_PRIM_START | URB_WRITE_PRIM_END); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], true); + break; + case 2: + brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], false); + brw_ff_gs_offset_header_dw2(c, + URB_WRITE_PRIM_END - URB_WRITE_PRIM_START); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], true); + break; + case 3: + if (check_edge_flags) { + /* Only emit vertices 0 and 1 if this is the first triangle of the + * polygon. Otherwise they are redundant. + */ + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + get_element_ud(c->reg.R0, 2), + brw_imm_ud(BRW_GS_EDGE_INDICATOR_0)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_IF(p, BRW_EXECUTE_1); + } + brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START); + brw_ff_gs_emit_vue(c, c->reg.vertex[0], false); + brw_ff_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START); + brw_ff_gs_emit_vue(c, c->reg.vertex[1], false); + if (check_edge_flags) { + brw_ENDIF(p); + /* Only emit vertex 2 in PRIM_END mode if this is the last triangle + * of the polygon. Otherwise leave the primitive incomplete because + * there are more polygon vertices coming. 
+ */ + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + get_element_ud(c->reg.R0, 2), + brw_imm_ud(BRW_GS_EDGE_INDICATOR_1)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL); + } + brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_END); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_ff_gs_emit_vue(c, c->reg.vertex[2], true); + break; + } +} + +const unsigned * +brw_compile_ff_gs_prog(struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_ff_gs_prog_key *key, + struct brw_ff_gs_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size) +{ + struct brw_ff_gs_compile c; + const GLuint *program; + + memset(&c, 0, sizeof(c)); + + c.key = *key; + c.vue_map = *vue_map; + c.nr_regs = (c.vue_map.num_slots + 1)/2; + c.prog_data = prog_data; + + mem_ctx = ralloc_context(NULL); + + /* Begin the compilation: + */ + brw_init_codegen(&compiler->isa, &c.func, mem_ctx); + + c.func.single_program_flow = 1; + + /* For some reason the thread is spawned with only 4 channels + * unmasked. + */ + brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE); + + if (compiler->devinfo->ver >= 6) { + unsigned num_verts; + bool check_edge_flag; + /* On Sandybridge, we use the GS for implementing transform feedback + * (called "Stream Out" in the PRM). 
+ */ + switch (key->primitive) { + case _3DPRIM_POINTLIST: + num_verts = 1; + check_edge_flag = false; + break; + case _3DPRIM_LINELIST: + case _3DPRIM_LINESTRIP: + case _3DPRIM_LINELOOP: + num_verts = 2; + check_edge_flag = false; + break; + case _3DPRIM_TRILIST: + case _3DPRIM_TRIFAN: + case _3DPRIM_TRISTRIP: + case _3DPRIM_RECTLIST: + num_verts = 3; + check_edge_flag = false; + break; + case _3DPRIM_QUADLIST: + case _3DPRIM_QUADSTRIP: + case _3DPRIM_POLYGON: + num_verts = 3; + check_edge_flag = true; + break; + default: + unreachable("Unexpected primitive type in Gen6 SOL program."); + } + gfx6_sol_program(&c, key, num_verts, check_edge_flag); + } else { + /* On Gen4-5, we use the GS to decompose certain types of primitives. + * Note that primitives which don't require a GS program have already + * been weeded out by now. + */ + switch (key->primitive) { + case _3DPRIM_QUADLIST: + brw_ff_gs_quads( &c, key ); + break; + case _3DPRIM_QUADSTRIP: + brw_ff_gs_quad_strip( &c, key ); + break; + case _3DPRIM_LINELOOP: + brw_ff_gs_lines( &c ); + break; + default: + return NULL; + } + } + + brw_compact_instructions(&c.func, 0, NULL); + + /* get the program + */ + program = brw_get_program(&c.func, final_assembly_size); + + if (INTEL_DEBUG(DEBUG_GS)) { + fprintf(stderr, "gs:\n"); + brw_disassemble_with_labels(&compiler->isa, c.func.store, + 0, *final_assembly_size, stderr); + fprintf(stderr, "\n"); + } + + return program; +} + diff --git a/src/intel/compiler/elk/brw_compile_sf.c b/src/intel/compiler/elk/brw_compile_sf.c new file mode 100644 index 00000000000..f9f23e3d2c9 --- /dev/null +++ b/src/intel/compiler/elk/brw_compile_sf.c @@ -0,0 +1,881 @@ +/* + * Copyright © 2006 - 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, 
publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_compiler.h" +#include "brw_disasm.h" +#include "brw_eu.h" +#include "brw_prim.h" + +#include "dev/intel_debug.h" + +struct brw_sf_compile { + struct brw_codegen func; + struct brw_sf_prog_key key; + struct brw_sf_prog_data prog_data; + + struct brw_reg pv; + struct brw_reg det; + struct brw_reg dx0; + struct brw_reg dx2; + struct brw_reg dy0; + struct brw_reg dy2; + + /* z and 1/w passed in separately: + */ + struct brw_reg z[3]; + struct brw_reg inv_w[3]; + + /* The vertices: + */ + struct brw_reg vert[3]; + + /* Temporaries, allocated after last vertex reg. + */ + struct brw_reg inv_det; + struct brw_reg a1_sub_a0; + struct brw_reg a2_sub_a0; + struct brw_reg tmp; + + struct brw_reg m1Cx; + struct brw_reg m2Cy; + struct brw_reg m3C0; + + GLuint nr_verts; + GLuint nr_attr_regs; + GLuint nr_setup_regs; + int urb_entry_read_offset; + + /** The last known value of the f0.0 flag register. */ + unsigned flag_value; + + struct intel_vue_map vue_map; +}; + +/** + * Determine the vue slot corresponding to the given half of the given register. 
+ */ +static inline int vert_reg_to_vue_slot(struct brw_sf_compile *c, GLuint reg, + int half) +{ + return (reg + c->urb_entry_read_offset) * 2 + half; +} + +/** + * Determine the varying corresponding to the given half of the given + * register. half=0 means the first half of a register, half=1 means the + * second half. + */ +static inline int vert_reg_to_varying(struct brw_sf_compile *c, GLuint reg, + int half) +{ + int vue_slot = vert_reg_to_vue_slot(c, reg, half); + return c->vue_map.slot_to_varying[vue_slot]; +} + +/** + * Determine the register corresponding to the given vue slot + */ +static struct brw_reg get_vue_slot(struct brw_sf_compile *c, + struct brw_reg vert, + int vue_slot) +{ + GLuint off = vue_slot / 2 - c->urb_entry_read_offset; + GLuint sub = vue_slot % 2; + + return brw_vec4_grf(vert.nr + off, sub * 4); +} + +/** + * Determine the register corresponding to the given varying. + */ +static struct brw_reg get_varying(struct brw_sf_compile *c, + struct brw_reg vert, + GLuint varying) +{ + int vue_slot = c->vue_map.varying_to_slot[varying]; + assert (vue_slot >= c->urb_entry_read_offset); + return get_vue_slot(c, vert, vue_slot); +} + +static bool +have_attr(struct brw_sf_compile *c, GLuint attr) +{ + return (c->key.attrs & BITFIELD64_BIT(attr)) ? 1 : 0; +} + +/*********************************************************************** + * Twoside lighting + */ +static void copy_bfc( struct brw_sf_compile *c, + struct brw_reg vert ) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + for (i = 0; i < 2; i++) { + if (have_attr(c, VARYING_SLOT_COL0+i) && + have_attr(c, VARYING_SLOT_BFC0+i)) + brw_MOV(p, + get_varying(c, vert, VARYING_SLOT_COL0+i), + get_varying(c, vert, VARYING_SLOT_BFC0+i)); + } +} + + +static void do_twoside_color( struct brw_sf_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint backface_conditional = c->key.frontface_ccw ? 
BRW_CONDITIONAL_G : BRW_CONDITIONAL_L; + + /* Already done in clip program: + */ + if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS) + return; + + /* If the vertex shader provides backface color, do the selection. The VS + * promises to set up the front color if the backface color is provided, but + * it may contain junk if never written to. + */ + if (!(have_attr(c, VARYING_SLOT_COL0) && have_attr(c, VARYING_SLOT_BFC0)) && + !(have_attr(c, VARYING_SLOT_COL1) && have_attr(c, VARYING_SLOT_BFC1))) + return; + + /* Need to use BRW_EXECUTE_4 and also do an 4-wide compare in order + * to get all channels active inside the IF. In the clipping code + * we run with NoMask, so it's not an option and we can use + * BRW_EXECUTE_1 for all comparisons. + */ + brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0)); + brw_IF(p, BRW_EXECUTE_4); + { + switch (c->nr_verts) { + case 3: copy_bfc(c, c->vert[2]); FALLTHROUGH; + case 2: copy_bfc(c, c->vert[1]); FALLTHROUGH; + case 1: copy_bfc(c, c->vert[0]); + } + } + brw_ENDIF(p); +} + + + +/*********************************************************************** + * Flat shading + */ + +static void copy_flatshaded_attributes(struct brw_sf_compile *c, + struct brw_reg dst, + struct brw_reg src) +{ + struct brw_codegen *p = &c->func; + int i; + + for (i = 0; i < c->vue_map.num_slots; i++) { + if (c->key.interp_mode[i] == INTERP_MODE_FLAT) { + brw_MOV(p, + get_vue_slot(c, dst, i), + get_vue_slot(c, src, i)); + } + } +} + +static int count_flatshaded_attributes(struct brw_sf_compile *c) +{ + int i; + int count = 0; + + for (i = 0; i < c->vue_map.num_slots; i++) + if (c->key.interp_mode[i] == INTERP_MODE_FLAT) + count++; + + return count; +} + + + +/* Need to use a computed jump to copy flatshaded attributes as the + * vertices are ordered according to y-coordinate before reaching this + * point, so the PV could be anywhere. 
+ */ +static void do_flatshade_triangle( struct brw_sf_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint nr; + GLuint jmpi = 1; + + /* Already done in clip program: + */ + if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS) + return; + + if (p->devinfo->ver == 5) + jmpi = 2; + + nr = count_flatshaded_attributes(c); + + brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1))); + brw_JMPI(p, c->pv, BRW_PREDICATE_NONE); + + copy_flatshaded_attributes(c, c->vert[1], c->vert[0]); + copy_flatshaded_attributes(c, c->vert[2], c->vert[0]); + brw_JMPI(p, brw_imm_d(jmpi*(nr*4+1)), BRW_PREDICATE_NONE); + + copy_flatshaded_attributes(c, c->vert[0], c->vert[1]); + copy_flatshaded_attributes(c, c->vert[2], c->vert[1]); + brw_JMPI(p, brw_imm_d(jmpi*nr*2), BRW_PREDICATE_NONE); + + copy_flatshaded_attributes(c, c->vert[0], c->vert[2]); + copy_flatshaded_attributes(c, c->vert[1], c->vert[2]); +} + + +static void do_flatshade_line( struct brw_sf_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint nr; + GLuint jmpi = 1; + + /* Already done in clip program: + */ + if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS) + return; + + if (p->devinfo->ver == 5) + jmpi = 2; + + nr = count_flatshaded_attributes(c); + + brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1))); + brw_JMPI(p, c->pv, BRW_PREDICATE_NONE); + copy_flatshaded_attributes(c, c->vert[1], c->vert[0]); + + brw_JMPI(p, brw_imm_ud(jmpi*nr), BRW_PREDICATE_NONE); + copy_flatshaded_attributes(c, c->vert[0], c->vert[1]); +} + + +/*********************************************************************** + * Triangle setup. 
+ */ + + +static void alloc_regs( struct brw_sf_compile *c ) +{ + GLuint reg, i; + + /* Values computed by fixed function unit: + */ + c->pv = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D); + c->det = brw_vec1_grf(1, 2); + c->dx0 = brw_vec1_grf(1, 3); + c->dx2 = brw_vec1_grf(1, 4); + c->dy0 = brw_vec1_grf(1, 5); + c->dy2 = brw_vec1_grf(1, 6); + + /* z and 1/w passed in separately: + */ + c->z[0] = brw_vec1_grf(2, 0); + c->inv_w[0] = brw_vec1_grf(2, 1); + c->z[1] = brw_vec1_grf(2, 2); + c->inv_w[1] = brw_vec1_grf(2, 3); + c->z[2] = brw_vec1_grf(2, 4); + c->inv_w[2] = brw_vec1_grf(2, 5); + + /* The vertices: + */ + reg = 3; + for (i = 0; i < c->nr_verts; i++) { + c->vert[i] = brw_vec8_grf(reg, 0); + reg += c->nr_attr_regs; + } + + /* Temporaries, allocated after last vertex reg. + */ + c->inv_det = brw_vec1_grf(reg, 0); reg++; + c->a1_sub_a0 = brw_vec8_grf(reg, 0); reg++; + c->a2_sub_a0 = brw_vec8_grf(reg, 0); reg++; + c->tmp = brw_vec8_grf(reg, 0); reg++; + + /* Note grf allocation: + */ + c->prog_data.total_grf = reg; + + + /* Outputs of this program - interpolation coefficients for + * rasterization: + */ + c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0); + c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0); + c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0); +} + + +static void copy_z_inv_w( struct brw_sf_compile *c ) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + /* Copy both scalars with a single MOV: + */ + for (i = 0; i < c->nr_verts; i++) + brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i])); +} + + +static void invert_det( struct brw_sf_compile *c) +{ + /* Looks like we invert all 8 elements just to get 1/det in + * position 2 !?! 
+ */ + gfx4_math(&c->func, + c->inv_det, + BRW_MATH_FUNCTION_INV, + 0, + c->det, + BRW_MATH_PRECISION_FULL); + +} + + +static bool +calculate_masks(struct brw_sf_compile *c, + GLuint reg, + GLushort *pc, + GLushort *pc_persp, + GLushort *pc_linear) +{ + bool is_last_attr = (reg == c->nr_setup_regs - 1); + enum glsl_interp_mode interp; + + *pc_persp = 0; + *pc_linear = 0; + *pc = 0xf; + + interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 0)]; + if (interp == INTERP_MODE_SMOOTH) { + *pc_linear = 0xf; + *pc_persp = 0xf; + } else if (interp == INTERP_MODE_NOPERSPECTIVE) + *pc_linear = 0xf; + + /* Maybe only process one attribute on the final round: + */ + if (vert_reg_to_varying(c, reg, 1) != BRW_VARYING_SLOT_COUNT) { + *pc |= 0xf0; + + interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 1)]; + if (interp == INTERP_MODE_SMOOTH) { + *pc_linear |= 0xf0; + *pc_persp |= 0xf0; + } else if (interp == INTERP_MODE_NOPERSPECTIVE) + *pc_linear |= 0xf0; + } + + return is_last_attr; +} + +/* Calculates the predicate control for which channels of a reg + * (containing 2 attrs) to do point sprite coordinate replacement on. 
+ */ +static uint16_t +calculate_point_sprite_mask(struct brw_sf_compile *c, GLuint reg) +{ + int varying1, varying2; + uint16_t pc = 0; + + varying1 = vert_reg_to_varying(c, reg, 0); + if (varying1 >= VARYING_SLOT_TEX0 && varying1 <= VARYING_SLOT_TEX7) { + if (c->key.point_sprite_coord_replace & (1 << (varying1 - VARYING_SLOT_TEX0))) + pc |= 0x0f; + } + if (varying1 == BRW_VARYING_SLOT_PNTC) + pc |= 0x0f; + + varying2 = vert_reg_to_varying(c, reg, 1); + if (varying2 >= VARYING_SLOT_TEX0 && varying2 <= VARYING_SLOT_TEX7) { + if (c->key.point_sprite_coord_replace & (1 << (varying2 - + VARYING_SLOT_TEX0))) + pc |= 0xf0; + } + if (varying2 == BRW_VARYING_SLOT_PNTC) + pc |= 0xf0; + + return pc; +} + +static void +set_predicate_control_flag_value(struct brw_codegen *p, + struct brw_sf_compile *c, + unsigned value) +{ + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + + if (value != 0xff) { + if (value != c->flag_value) { + brw_MOV(p, brw_flag_reg(0, 0), brw_imm_uw(value)); + c->flag_value = value; + } + + brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL); + } +} + +static void brw_emit_tri_setup(struct brw_sf_compile *c, bool allocate) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + c->flag_value = 0xff; + c->nr_verts = 3; + + if (allocate) + alloc_regs(c); + + invert_det(c); + copy_z_inv_w(c); + + if (c->key.do_twoside_color) + do_twoside_color(c); + + if (c->key.contains_flat_varying) + do_flatshade_triangle(c); + + + for (i = 0; i < c->nr_setup_regs; i++) + { + /* Pair of incoming attributes: + */ + struct brw_reg a0 = offset(c->vert[0], i); + struct brw_reg a1 = offset(c->vert[1], i); + struct brw_reg a2 = offset(c->vert[2], i); + GLushort pc, pc_persp, pc_linear; + bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + set_predicate_control_flag_value(p, c, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + brw_MUL(p, a1, a1, c->inv_w[1]); + brw_MUL(p, a2, a2, c->inv_w[2]); + } + + + /* Calculate coefficients 
for interpolated values: + */ + if (pc_linear) + { + set_predicate_control_flag_value(p, c, pc_linear); + + brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); + brw_ADD(p, c->a2_sub_a0, a2, negate(a0)); + + /* calculate dA/dx + */ + brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2); + brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0)); + brw_MUL(p, c->m1Cx, c->tmp, c->inv_det); + + /* calculate dA/dy + */ + brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0); + brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2)); + brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); + } + + { + set_predicate_control_flag_value(p, c, pc); + /* start point for interpolation + */ + brw_MOV(p, c->m3C0, a0); + + /* Copy m0..m3 to URB. m0 is implicitly copied from r0 in + * the send instruction: + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), /* r0, will be copied to m0 */ + last ? BRW_URB_WRITE_EOT_COMPLETE + : BRW_URB_WRITE_NO_FLAGS, + 4, /* msg len */ + 0, /* response len */ + i*4, /* offset */ + BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */ + } + } + + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); +} + + + +static void brw_emit_line_setup(struct brw_sf_compile *c, bool allocate) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + c->flag_value = 0xff; + c->nr_verts = 2; + + if (allocate) + alloc_regs(c); + + invert_det(c); + copy_z_inv_w(c); + + if (c->key.contains_flat_varying) + do_flatshade_line(c); + + for (i = 0; i < c->nr_setup_regs; i++) + { + /* Pair of incoming attributes: + */ + struct brw_reg a0 = offset(c->vert[0], i); + struct brw_reg a1 = offset(c->vert[1], i); + GLushort pc, pc_persp, pc_linear; + bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + set_predicate_control_flag_value(p, c, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + brw_MUL(p, a1, a1, c->inv_w[1]); + } + + /* Calculate coefficients for position, color: + */ + if (pc_linear) { + set_predicate_control_flag_value(p, c, pc_linear); + + 
brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); + + brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0); + brw_MUL(p, c->m1Cx, c->tmp, c->inv_det); + + brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0); + brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); + } + + { + set_predicate_control_flag_value(p, c, pc); + + /* start point for interpolation + */ + brw_MOV(p, c->m3C0, a0); + + /* Copy m0..m3 to URB. + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + last ? BRW_URB_WRITE_EOT_COMPLETE + : BRW_URB_WRITE_NO_FLAGS, + 4, /* msg len */ + 0, /* response len */ + i*4, /* urb destination offset */ + BRW_URB_SWIZZLE_TRANSPOSE); + } + } + + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); +} + +static void brw_emit_point_sprite_setup(struct brw_sf_compile *c, bool allocate) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + c->flag_value = 0xff; + c->nr_verts = 1; + + if (allocate) + alloc_regs(c); + + copy_z_inv_w(c); + for (i = 0; i < c->nr_setup_regs; i++) + { + struct brw_reg a0 = offset(c->vert[0], i); + GLushort pc, pc_persp, pc_linear, pc_coord_replace; + bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + pc_coord_replace = calculate_point_sprite_mask(c, i); + pc_persp &= ~pc_coord_replace; + + if (pc_persp) { + set_predicate_control_flag_value(p, c, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + } + + /* Point sprite coordinate replacement: A texcoord with this + * enabled gets replaced with the value (x, y, 0, 1) where x and + * y vary from 0 to 1 across the horizontal and vertical of the + * point. 
+ */ + if (pc_coord_replace) { + set_predicate_control_flag_value(p, c, pc_coord_replace); + /* Calculate 1.0/PointWidth */ + gfx4_math(&c->func, + c->tmp, + BRW_MATH_FUNCTION_INV, + 0, + c->dx0, + BRW_MATH_PRECISION_FULL); + + brw_set_default_access_mode(p, BRW_ALIGN_16); + + /* dA/dx, dA/dy */ + brw_MOV(p, c->m1Cx, brw_imm_f(0.0)); + brw_MOV(p, c->m2Cy, brw_imm_f(0.0)); + brw_MOV(p, brw_writemask(c->m1Cx, WRITEMASK_X), c->tmp); + if (c->key.sprite_origin_lower_left) { + brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), negate(c->tmp)); + } else { + brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), c->tmp); + } + + /* attribute constant offset */ + brw_MOV(p, c->m3C0, brw_imm_f(0.0)); + if (c->key.sprite_origin_lower_left) { + brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_YW), brw_imm_f(1.0)); + } else { + brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_W), brw_imm_f(1.0)); + } + + brw_set_default_access_mode(p, BRW_ALIGN_1); + } + + if (pc & ~pc_coord_replace) { + set_predicate_control_flag_value(p, c, pc & ~pc_coord_replace); + brw_MOV(p, c->m1Cx, brw_imm_ud(0)); + brw_MOV(p, c->m2Cy, brw_imm_ud(0)); + brw_MOV(p, c->m3C0, a0); /* constant value */ + } + + + set_predicate_control_flag_value(p, c, pc); + /* Copy m0..m3 to URB. */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + last ? BRW_URB_WRITE_EOT_COMPLETE + : BRW_URB_WRITE_NO_FLAGS, + 4, /* msg len */ + 0, /* response len */ + i*4, /* urb destination offset */ + BRW_URB_SWIZZLE_TRANSPOSE); + } + + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); +} + +/* Points setup - several simplifications as all attributes are + * constant across the face of the point (point sprites excluded!) 
+ */ +static void brw_emit_point_setup(struct brw_sf_compile *c, bool allocate) +{ + struct brw_codegen *p = &c->func; + GLuint i; + + c->flag_value = 0xff; + c->nr_verts = 1; + + if (allocate) + alloc_regs(c); + + copy_z_inv_w(c); + + brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */ + brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */ + + for (i = 0; i < c->nr_setup_regs; i++) + { + struct brw_reg a0 = offset(c->vert[0], i); + GLushort pc, pc_persp, pc_linear; + bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + /* This seems odd as the values are all constant, but the + * fragment shader will be expecting it: + */ + set_predicate_control_flag_value(p, c, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + } + + + /* The delta values are always zero, just send the starting + * coordinate. Again, this is to fit in with the interpolation + * code in the fragment shader. + */ + { + set_predicate_control_flag_value(p, c, pc); + + brw_MOV(p, c->m3C0, a0); /* constant value */ + + /* Copy m0..m3 to URB. + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + last ? 
BRW_URB_WRITE_EOT_COMPLETE
+                          : BRW_URB_WRITE_NO_FLAGS,
+                      4,  /* msg len */
+                      0,  /* response len */
+                      i*4,  /* urb destination offset */
+                      BRW_URB_SWIZZLE_TRANSPOSE);
+      }
+   }
+
+   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+}
+
+static void brw_emit_anyprim_setup( struct brw_sf_compile *c )
+{
+   struct brw_codegen *p = &c->func;
+   struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
+   struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
+   struct brw_reg primmask;
+   int jmp;
+   struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+
+   c->nr_verts = 3;
+   alloc_regs(c);
+
+   primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);
+
+   brw_MOV(p, primmask, brw_imm_ud(1));
+   brw_SHL(p, primmask, primmask, payload_prim);
+
+   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
+                                               (1<<_3DPRIM_TRISTRIP) |
+                                               (1<<_3DPRIM_TRIFAN) |
+                                               (1<<_3DPRIM_TRISTRIP_REVERSE) |
+                                               (1<<_3DPRIM_POLYGON) |
+                                               (1<<_3DPRIM_RECTLIST) |
+                                               (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
+   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
+   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
+   brw_emit_tri_setup(c, false);
+   brw_land_fwd_jump(p, jmp);
+
+   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
+                                               (1<<_3DPRIM_LINESTRIP) |
+                                               (1<<_3DPRIM_LINELOOP) |
+                                               (1<<_3DPRIM_LINESTRIP_CONT) |
+                                               (1<<_3DPRIM_LINESTRIP_BF) |
+                                               (1<<_3DPRIM_LINESTRIP_CONT_BF)));
+   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
+   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
+   brw_emit_line_setup(c, false);
+   brw_land_fwd_jump(p, jmp);
+
+   brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
+   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
+   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
+   brw_emit_point_sprite_setup(c, false);
+   brw_land_fwd_jump(p, jmp);
+
+   brw_emit_point_setup( c, false );
+}
+
+const 
unsigned * +brw_compile_sf(const struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_sf_prog_key *key, + struct brw_sf_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size) +{ + struct brw_sf_compile c; + memset(&c, 0, sizeof(c)); + + /* Begin the compilation: + */ + brw_init_codegen(&compiler->isa, &c.func, mem_ctx); + + c.key = *key; + c.vue_map = *vue_map; + if (c.key.do_point_coord) { + /* + * gl_PointCoord is a FS instead of VS builtin variable, thus it's + * not included in c.vue_map generated in VS stage. Here we add + * it manually to let SF shader generate the needed interpolation + * coefficient for FS shader. + */ + c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots; + c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC; + } + c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET; + c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset; + c.nr_setup_regs = c.nr_attr_regs; + + c.prog_data.urb_read_length = c.nr_attr_regs; + c.prog_data.urb_entry_size = c.nr_setup_regs * 2; + + /* Which primitive? Or all three? + */ + switch (key->primitive) { + case BRW_SF_PRIM_TRIANGLES: + c.nr_verts = 3; + brw_emit_tri_setup( &c, true ); + break; + case BRW_SF_PRIM_LINES: + c.nr_verts = 2; + brw_emit_line_setup( &c, true ); + break; + case BRW_SF_PRIM_POINTS: + c.nr_verts = 1; + if (key->do_point_sprite) + brw_emit_point_sprite_setup( &c, true ); + else + brw_emit_point_setup( &c, true ); + break; + case BRW_SF_PRIM_UNFILLED_TRIS: + c.nr_verts = 3; + brw_emit_anyprim_setup( &c ); + break; + default: + unreachable("not reached"); + } + + /* FINISHME: SF programs use calculated jumps (i.e., JMPI with a register + * source). Compacting would be difficult. 
+ */ + /* brw_compact_instructions(&c.func, 0, 0, NULL); */ + + *prog_data = c.prog_data; + + const unsigned *program = brw_get_program(&c.func, final_assembly_size); + + if (INTEL_DEBUG(DEBUG_SF)) { + fprintf(stderr, "sf:\n"); + brw_disassemble_with_labels(&compiler->isa, + program, 0, *final_assembly_size, stderr); + fprintf(stderr, "\n"); + } + + return program; +} diff --git a/src/intel/compiler/elk/brw_compiler.c b/src/intel/compiler/elk/brw_compiler.c new file mode 100644 index 00000000000..c267a05a0a5 --- /dev/null +++ b/src/intel/compiler/elk/brw_compiler.c @@ -0,0 +1,370 @@ +/* + * Copyright © 2015-2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#include "brw_compiler.h"
+#include "brw_shader.h"
+#include "brw_eu.h"
+#include "brw_nir.h"
+#include "dev/intel_debug.h"
+#include "compiler/nir/nir.h"
+#include "util/u_debug.h"
+
+/* NIR compiler options shared by the scalar and vec4 backends. */
+#define COMMON_OPTIONS                                                        \
+   .has_uclz = true,                                                          \
+   .lower_fdiv = true,                                                        \
+   .lower_scmp = true,                                                        \
+   .lower_flrp16 = true,                                                      \
+   .lower_fmod = true,                                                        \
+   .lower_ufind_msb = true,                                                   \
+   .lower_uadd_carry = true,                                                  \
+   .lower_usub_borrow = true,                                                 \
+   .lower_flrp64 = true,                                                      \
+   .lower_fisnormal = true,                                                   \
+   .lower_isign = true,                                                       \
+   .lower_ldexp = true,                                                       \
+   .lower_bitfield_extract = true,                                            \
+   .lower_bitfield_insert = true,                                             \
+   .lower_device_index_to_zero = true,                                        \
+   .vectorize_io = true,                                                      \
+   .vectorize_tess_levels = true,                                             \
+   .use_interpolated_input_intrinsics = true,                                 \
+   .lower_insert_byte = true,                                                 \
+   .lower_insert_word = true,                                                 \
+   .vertex_id_zero_based = true,                                              \
+   .lower_base_vertex = true,                                                 \
+   .support_16bit_alu = true,                                                 \
+   .lower_uniforms_to_ubo = true
+
+/* Additional NIR options used only by the scalar (FS) backend. */
+#define COMMON_SCALAR_OPTIONS                                                 \
+   .lower_to_scalar = true,                                                   \
+   .lower_pack_half_2x16 = true,                                              \
+   .lower_pack_snorm_2x16 = true,                                             \
+   .lower_pack_snorm_4x8 = true,                                              \
+   .lower_pack_unorm_2x16 = true,                                             \
+   .lower_pack_unorm_4x8 = true,                                              \
+   .lower_unpack_half_2x16 = true,                                            \
+   .lower_unpack_snorm_2x16 = true,                                           \
+   .lower_unpack_snorm_4x8 = true,                                            \
+   .lower_unpack_unorm_2x16 = true,                                           \
+   .lower_unpack_unorm_4x8 = true,                                            \
+   .lower_hadd64 = true,                                                      \
+   .avoid_ternary_with_two_constants = true,                                  \
+   .has_pack_32_4x8 = true,                                                   \
+   .max_unroll_iterations = 32,                                               \
+   .force_indirect_unrolling = nir_var_function_temp,                         \
+   .divergence_analysis_options =                                             \
+      (nir_divergence_single_patch_per_tcs_subgroup |                         \
+       nir_divergence_single_patch_per_tes_subgroup |                         \
+       nir_divergence_shader_record_ptr_uniform)
+
+const struct nir_shader_compiler_options brw_scalar_nir_options = {
+   COMMON_OPTIONS,
+   COMMON_SCALAR_OPTIONS,
+};
+
+const struct nir_shader_compiler_options brw_vector_nir_options = {
+   COMMON_OPTIONS,
+
+   /* In the vec4 backend, our dpN instruction replicates its result to all the
+    * components of a vec4.  We would like NIR to give us replicated fdot
+    * instructions because it can optimize better for us.
+    */
+   .fdot_replicates = true,
+
+   .lower_usub_sat = true,
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+   .intel_vec4 = true,
+   .max_unroll_iterations = 32,
+};
+
+/* Allocate and initialize a brw_compiler for the given device: register
+ * allocation sets, per-stage scalar/vec4 selection, and per-stage NIR
+ * compiler options (ralloc'ed as children of the compiler).
+ */
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
+{
+   struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
+
+   compiler->devinfo = devinfo;
+
+   brw_init_isa_info(&compiler->isa, devinfo);
+
+   brw_fs_alloc_reg_sets(compiler);
+   if (devinfo->ver < 8)
+      brw_vec4_alloc_reg_set(compiler);
+
+   compiler->precise_trig = debug_get_bool_option("INTEL_PRECISE_TRIG", false);
+
+   compiler->use_tcs_multi_patch = devinfo->ver >= 12;
+
+   /* Default to the sampler since that's what we've done since forever */
+   compiler->indirect_ubos_use_sampler = true;
+
+   compiler->lower_dpas = devinfo->verx10 < 125 ||
+      intel_device_info_is_mtl(devinfo) ||
+      (intel_device_info_is_arl(devinfo) &&
+       devinfo->platform != INTEL_PLATFORM_ARL_H) ||
+      debug_get_bool_option("INTEL_LOWER_DPAS", false);
+
+   /* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
+   for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
+      compiler->scalar_stage[i] = devinfo->ver >= 8 ||
+         i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;
+   }
+
+   /* Task/mesh and ray-tracing stages are always scalar. */
+   for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)
+      compiler->scalar_stage[i] = true;
+
+   nir_lower_int64_options int64_options =
+      nir_lower_imul64 |
+      nir_lower_isign64 |
+      nir_lower_divmod64 |
+      nir_lower_imul_high64 |
+      nir_lower_find_lsb64 |
+      nir_lower_ufind_msb64 |
+      nir_lower_bit_count64;
+   nir_lower_doubles_options fp64_options =
+      nir_lower_drcp |
+      nir_lower_dsqrt |
+      nir_lower_drsq |
+      nir_lower_dtrunc |
+      nir_lower_dfloor |
+      nir_lower_dceil |
+      nir_lower_dfract |
+      nir_lower_dround_even |
+      nir_lower_dmod |
+      nir_lower_dsub |
+      nir_lower_ddiv;
+
+   if (!devinfo->has_64bit_float || INTEL_DEBUG(DEBUG_SOFT64))
+      fp64_options |= nir_lower_fp64_full_software;
+   if (!devinfo->has_64bit_int)
+      int64_options |= (nir_lower_int64_options)~0;
+
+   /* The Bspec's section titled "Instruction_multiply[DevBDW+]" claims that
+    * destination type can be Quadword and source type Doubleword for Gfx8 and
+    * Gfx9. So, lower 64 bit multiply instruction on rest of the platforms.
+    */
+   if (devinfo->ver < 8 || devinfo->ver > 9)
+      int64_options |= nir_lower_imul_2x32_64;
+
+   /* We want the GLSL compiler to emit code that uses condition codes */
+   for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {
+      struct nir_shader_compiler_options *nir_options =
+         rzalloc(compiler, struct nir_shader_compiler_options);
+      bool is_scalar = compiler->scalar_stage[i];
+      if (is_scalar) {
+         *nir_options = brw_scalar_nir_options;
+         int64_options |= nir_lower_usub_sat64;
+      } else {
+         *nir_options = brw_vector_nir_options;
+      }
+
+      /* Prior to Gfx6, there are no three source operations, and Gfx11 loses
+       * LRP.
+       */
+      nir_options->lower_ffma16 = devinfo->ver < 6;
+      nir_options->lower_ffma32 = devinfo->ver < 6;
+      nir_options->lower_ffma64 = devinfo->ver < 6;
+      nir_options->lower_flrp32 = devinfo->ver < 6 || devinfo->ver >= 11;
+      nir_options->lower_fpow = devinfo->ver >= 12;
+
+      nir_options->has_bfe = devinfo->ver >= 7;
+      nir_options->has_bfm = devinfo->ver >= 7;
+      nir_options->has_bfi = devinfo->ver >= 7;
+
+      nir_options->has_rotate16 = devinfo->ver >= 11;
+      nir_options->has_rotate32 = devinfo->ver >= 11;
+      nir_options->lower_bitfield_reverse = devinfo->ver < 7;
+      nir_options->lower_find_lsb = devinfo->ver < 7;
+      nir_options->lower_ifind_msb = devinfo->ver < 7;
+      nir_options->has_iadd3 = devinfo->verx10 >= 125;
+
+      nir_options->has_sdot_4x8 = devinfo->ver >= 12;
+      nir_options->has_udot_4x8 = devinfo->ver >= 12;
+      nir_options->has_sudot_4x8 = devinfo->ver >= 12;
+      nir_options->has_sdot_4x8_sat = devinfo->ver >= 12;
+      nir_options->has_udot_4x8_sat = devinfo->ver >= 12;
+      nir_options->has_sudot_4x8_sat = devinfo->ver >= 12;
+
+      nir_options->lower_int64_options = int64_options;
+      nir_options->lower_doubles_options = fp64_options;
+
+      nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT;
+
+      nir_options->force_indirect_unrolling |=
+         brw_nir_no_indirect_mask(compiler, i);
+      nir_options->force_indirect_unrolling_sampler = devinfo->ver < 7;
+
+      if (compiler->use_tcs_multi_patch) {
+         /* TCS MULTI_PATCH mode has multiple patches per subgroup */
+         nir_options->divergence_analysis_options &=
+            ~nir_divergence_single_patch_per_tcs_subgroup;
+      }
+
+      if (devinfo->ver < 12)
+         nir_options->divergence_analysis_options |=
+            nir_divergence_single_prim_per_subgroup;
+
+      compiler->nir_options[i] = nir_options;
+   }
+
+   compiler->mesh.mue_header_packing =
+      (unsigned)debug_get_num_option("INTEL_MESH_HEADER_PACKING", 3);
+   compiler->mesh.mue_compaction =
+      debug_get_bool_option("INTEL_MESH_COMPACTION", true);
+
+   return compiler;
+}
+
+/* Shift a single bit into the low end of *val. */
+static void
+insert_u64_bit(uint64_t *val, bool add)
+{
+   *val = (*val << 1) | !!add;
+}
+
+/* Pack the compiler options that affect generated code into a single
+ * uint64_t (presumably used to key the on-disk shader cache, given the
+ * DEBUG_DISK_CACHE_MASK / SIMD_DISK_CACHE_MASK inputs — confirm at callers).
+ */
+uint64_t
+brw_get_compiler_config_value(const struct brw_compiler *compiler)
+{
+   uint64_t config = 0;
+   unsigned bits = 0;
+
+   insert_u64_bit(&config, compiler->precise_trig);
+   bits++;
+   insert_u64_bit(&config, compiler->lower_dpas);
+   bits++;
+   insert_u64_bit(&config, compiler->mesh.mue_compaction);
+   bits++;
+
+   uint64_t mask = DEBUG_DISK_CACHE_MASK;
+   bits += util_bitcount64(mask);
+
+   u_foreach_bit64(bit, mask)
+      insert_u64_bit(&config, INTEL_DEBUG(1ULL << bit));
+
+   mask = SIMD_DISK_CACHE_MASK;
+   bits += util_bitcount64(mask);
+
+   u_foreach_bit64(bit, mask)
+      insert_u64_bit(&config, (intel_simd & (1ULL << bit)) != 0);
+
+   /* Two bits for mue_header_packing (values 0-3). */
+   mask = 3;
+   bits += util_bitcount64(mask);
+
+   u_foreach_bit64(bit, mask)
+      insert_u64_bit(&config, (compiler->mesh.mue_header_packing & (1ULL << bit)) != 0);
+
+   /* All inserted bits must fit in the 64-bit result. */
+   assert(bits <= util_bitcount64(UINT64_MAX));
+
+   return config;
+}
+
+/* Write the device's SHA-1 (as accumulated by brw_device_sha1_update) into
+ * hex as a formatted hex string.
+ */
+void
+brw_device_sha1(char *hex,
+                const struct intel_device_info *devinfo) {
+   struct mesa_sha1 ctx;
+   _mesa_sha1_init(&ctx);
+   brw_device_sha1_update(&ctx, devinfo);
+   unsigned char result[20];
+   _mesa_sha1_final(&ctx, result);
+   _mesa_sha1_format(hex, result);
+}
+
+/* Size in bytes of the stage-specific prog_data structure for a stage. */
+unsigned
+brw_prog_data_size(gl_shader_stage stage)
+{
+   static const size_t stage_sizes[] = {
+      [MESA_SHADER_VERTEX]       = sizeof(struct brw_vs_prog_data),
+      [MESA_SHADER_TESS_CTRL]    = sizeof(struct brw_tcs_prog_data),
+      [MESA_SHADER_TESS_EVAL]    = sizeof(struct brw_tes_prog_data),
+      [MESA_SHADER_GEOMETRY]     = sizeof(struct brw_gs_prog_data),
+      [MESA_SHADER_FRAGMENT]     = sizeof(struct brw_wm_prog_data),
+      [MESA_SHADER_COMPUTE]      = sizeof(struct brw_cs_prog_data),
+      [MESA_SHADER_TASK]         = sizeof(struct brw_task_prog_data),
+      [MESA_SHADER_MESH]         = sizeof(struct brw_mesh_prog_data),
+      [MESA_SHADER_RAYGEN]       = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_ANY_HIT]      = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_CLOSEST_HIT]  = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_MISS]         = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_CALLABLE]     = sizeof(struct brw_bs_prog_data),
+      [MESA_SHADER_KERNEL]       = sizeof(struct brw_cs_prog_data),
+   };
+   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
+   return stage_sizes[stage];
+}
+
+/* Size in bytes of the stage-specific prog_key structure for a stage. */
+unsigned
+brw_prog_key_size(gl_shader_stage stage)
+{
+   static const size_t stage_sizes[] = {
+      [MESA_SHADER_VERTEX]       = sizeof(struct brw_vs_prog_key),
+      [MESA_SHADER_TESS_CTRL]    = sizeof(struct brw_tcs_prog_key),
+      [MESA_SHADER_TESS_EVAL]    = sizeof(struct brw_tes_prog_key),
+      [MESA_SHADER_GEOMETRY]     = sizeof(struct brw_gs_prog_key),
+      [MESA_SHADER_FRAGMENT]     = sizeof(struct brw_wm_prog_key),
+      [MESA_SHADER_COMPUTE]      = sizeof(struct brw_cs_prog_key),
+      [MESA_SHADER_TASK]         = sizeof(struct brw_task_prog_key),
+      [MESA_SHADER_MESH]         = sizeof(struct brw_mesh_prog_key),
+      [MESA_SHADER_RAYGEN]       = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_ANY_HIT]      = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_CLOSEST_HIT]  = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_MISS]         = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_CALLABLE]     = sizeof(struct brw_bs_prog_key),
+      [MESA_SHADER_KERNEL]       = sizeof(struct brw_cs_prog_key),
+   };
+   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
+   return stage_sizes[stage];
+}
+
+/* Patch relocation sites recorded in prog_data into the assembled program,
+ * matching each site's id against the caller-supplied values.
+ */
+void
+brw_write_shader_relocs(const struct brw_isa_info *isa,
+                        void *program,
+                        const struct brw_stage_prog_data *prog_data,
+                        struct brw_shader_reloc_value *values,
+                        unsigned num_values)
+{
+   for (unsigned i = 0; i < prog_data->num_relocs; i++) {
+      assert(prog_data->relocs[i].offset % 8 == 0);
+      void *dst = program + prog_data->relocs[i].offset;
+      for (unsigned j = 0; j < num_values; j++) {
+         if (prog_data->relocs[i].id == values[j].id) {
+            uint32_t value = values[j].value + prog_data->relocs[i].delta;
+            switch (prog_data->relocs[i].type) {
+            case BRW_SHADER_RELOC_TYPE_U32:
+               *(uint32_t *)dst = value;
+               break;
+            case BRW_SHADER_RELOC_TYPE_MOV_IMM:
+               brw_update_reloc_imm(isa, dst, value);
+               break;
+            default:
+               unreachable("Invalid relocation type");
+            }
+            break;
+         }
+      }
+   }
+}
diff --git a/src/intel/compiler/elk/brw_compiler.h b/src/intel/compiler/elk/brw_compiler.h
new file mode 100644
index 00000000000..30a05d8e287
--- /dev/null
+++ b/src/intel/compiler/elk/brw_compiler.h
@@ -0,0 +1,2131 @@
+/*
+ * Copyright © 2010 - 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */ + +#ifndef BRW_COMPILER_H +#define BRW_COMPILER_H + +#include +#include "c11/threads.h" +#include "dev/intel_device_info.h" +#include "isl/isl.h" +#include "util/macros.h" +#include "util/mesa-sha1.h" +#include "util/enum_operators.h" +#include "util/ralloc.h" +#include "util/u_math.h" +#include "brw_isa_info.h" +#include "intel_shader_enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ra_regs; +struct nir_shader; +struct shader_info; + +struct nir_shader_compiler_options; +typedef struct nir_shader nir_shader; + +struct brw_compiler { + const struct intel_device_info *devinfo; + + /* This lock must be taken if the compiler is to be modified in any way, + * including adding something to the ralloc child list. + */ + mtx_t mutex; + + struct brw_isa_info isa; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used. + */ + struct ra_class **classes; + } vec4_reg_set; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used, indexed by register size. + */ + struct ra_class *classes[16]; + + /** + * ra class for the aligned barycentrics we use for PLN, which doesn't + * appear in *classes. + */ + struct ra_class *aligned_bary_class; + } fs_reg_sets[3]; + + void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4); + void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4); + + bool scalar_stage[MESA_ALL_SHADER_STAGES]; + bool use_tcs_multi_patch; + struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES]; + + /** + * Apply workarounds for SIN and COS output range problems. + * This can negatively impact performance. + */ + bool precise_trig; + + /** + * Is 3DSTATE_CONSTANT_*'s Constant Buffer 0 relative to Dynamic State + * Base Address? (If not, it's a normal GPU address.) 
+ */ + bool constant_buffer_0_is_relative; + + /** + * Whether or not the driver supports NIR shader constants. This controls + * whether nir_opt_large_constants will be run. + */ + bool supports_shader_constants; + + /** + * Whether indirect UBO loads should use the sampler or go through the + * data/constant cache. For the sampler, UBO surface states have to be set + * up with VK_FORMAT_R32G32B32A32_FLOAT whereas if it's going through the + * constant or data cache, UBOs must use VK_FORMAT_RAW. + */ + bool indirect_ubos_use_sampler; + + /** + * Gfx12.5+ has a bit in the SEND instruction extending the bindless + * surface offset range from 20 to 26 bits, effectively giving us 4Gb of + * bindless surface descriptors instead of 64Mb previously. + */ + bool extended_bindless_surface_offset; + + /** + * Gfx11+ has a bit in the dword 3 of the sampler message header that + * indicates whether the sampler handle is relative to the dynamic state + * base address (0) or the bindless sampler base address (1). The driver + * can select this. + */ + bool use_bindless_sampler_offset; + + /** + * Should DPAS instructions be lowered? + * + * This will be set for all platforms before Gfx12.5. It may also be set + * platforms that support DPAS for testing purposes. + */ + bool lower_dpas; + + /** + * Calling the ra_allocate function after each register spill can take + * several minutes. This option speeds up shader compilation by spilling + * more registers after the ra_allocate failure. Required for + * Cyberpunk 2077, which uses a watchdog thread to terminate the process + * in case the render thread hasn't responded within 2 minutes. + */ + int spilling_rate; + + struct nir_shader *clc_shader; + + struct { + unsigned mue_header_packing; + bool mue_compaction; + } mesh; +}; + +#define brw_shader_debug_log(compiler, data, fmt, ... 
) do { \ + static unsigned id = 0; \ + compiler->shader_debug_log(data, &id, fmt, ##__VA_ARGS__); \ +} while (0) + +#define brw_shader_perf_log(compiler, data, fmt, ... ) do { \ + static unsigned id = 0; \ + compiler->shader_perf_log(data, &id, fmt, ##__VA_ARGS__); \ +} while (0) + +/** + * We use a constant subgroup size of 32. It really only needs to be a + * maximum and, since we do SIMD32 for compute shaders in some cases, it + * needs to be at least 32. SIMD8 and SIMD16 shaders will still claim a + * subgroup size of 32 but will act as if 16 or 24 of those channels are + * disabled. + */ +#define BRW_SUBGROUP_SIZE 32 + +static inline bool +brw_shader_stage_is_bindless(gl_shader_stage stage) +{ + return stage >= MESA_SHADER_RAYGEN && + stage <= MESA_SHADER_CALLABLE; +} + +static inline bool +brw_shader_stage_requires_bindless_resources(gl_shader_stage stage) +{ + return brw_shader_stage_is_bindless(stage) || gl_shader_stage_is_mesh(stage); +} + +/** + * Program key structures. + * + * When drawing, we look for the currently bound shaders in the program + * cache. This is essentially a hash table lookup, and these are the keys. + * + * Sometimes OpenGL features specified as state need to be simulated via + * shader code, due to a mismatch between the API and the hardware. This + * is often referred to as "non-orthagonal state" or "NOS". We store NOS + * in the program key so it's considered when searching for a program. If + * we haven't seen a particular combination before, we have to recompile a + * new specialized version. + * + * Shader compilation should not look up state in gl_context directly, but + * instead use the copy in the program key. This guarantees recompiles will + * happen correctly. 
+ * + * @{ + */ + +enum PACKED gfx6_gather_sampler_wa { + WA_SIGN = 1, /* whether we need to sign extend */ + WA_8BIT = 2, /* if we have an 8bit format needing wa */ + WA_16BIT = 4, /* if we have a 16bit format needing wa */ +}; + +#define BRW_MAX_SAMPLERS 32 + +/* Provide explicit padding for each member, to ensure that the compiler + * initializes every bit in the shader cache keys. The keys will be compared + * with memcmp. + */ +PRAGMA_DIAGNOSTIC_PUSH +PRAGMA_DIAGNOSTIC_ERROR(-Wpadded) + +/** + * Sampler information needed by VS, WM, and GS program cache keys. + */ +struct brw_sampler_prog_key_data { + /** + * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles. + * + * This field is not consumed by the back-end compiler and is only relevant + * for the crocus OpenGL driver for Broadwell and earlier hardware. + */ + uint16_t swizzles[BRW_MAX_SAMPLERS]; + + uint32_t gl_clamp_mask[3]; + + /** + * For RG32F, gather4's channel select is broken. + */ + uint32_t gather_channel_quirk_mask; + + /** + * For Sandybridge, which shader w/a we need for gather quirks. + */ + enum gfx6_gather_sampler_wa gfx6_gather_wa[BRW_MAX_SAMPLERS]; +}; + +enum brw_robustness_flags { + BRW_ROBUSTNESS_UBO = BITFIELD_BIT(0), + BRW_ROBUSTNESS_SSBO = BITFIELD_BIT(1), +}; + +struct brw_base_prog_key { + unsigned program_string_id; + + enum brw_robustness_flags robust_flags:2; + + unsigned padding:22; + + /** + * Apply workarounds for SIN and COS input range problems. + * This limits input range for SIN and COS to [-2p : 2p] to + * avoid precision issues. + */ + bool limit_trig_input_range; + + struct brw_sampler_prog_key_data tex; +}; + +/** + * The VF can't natively handle certain types of attributes, such as GL_FIXED + * or most 10_10_10_2 types. These flags enable various VS workarounds to + * "fix" attributes at the beginning of shaders. 
+ */ +#define BRW_ATTRIB_WA_COMPONENT_MASK 7 /* mask for GL_FIXED scale channel count */ +#define BRW_ATTRIB_WA_NORMALIZE 8 /* normalize in shader */ +#define BRW_ATTRIB_WA_BGRA 16 /* swap r/b channels in shader */ +#define BRW_ATTRIB_WA_SIGN 32 /* interpret as signed in shader */ +#define BRW_ATTRIB_WA_SCALE 64 /* interpret as scaled in shader */ + +/** + * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range + * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user + * input vertex attributes. In Vulkan, we expose up to 28 user vertex input + * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0. + */ +#define MAX_GL_VERT_ATTRIB VERT_ATTRIB_MAX +#define MAX_VK_VERT_ATTRIB (VERT_ATTRIB_GENERIC0 + 28) + +/** + * Max number of binding table entries used for stream output. + * + * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the + * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64. + * + * On Gfx6, the size of transform feedback data is limited not by the number + * of components but by the number of binding table entries we set aside. We + * use one binding table entry for a float, one entry for a vector, and one + * entry per matrix column. Since the only way we can communicate our + * transform feedback capabilities to the client is via + * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the + * worst case, in which all the varyings are floats, so we use up one binding + * table entry per component. Therefore we need to set aside at least 64 + * binding table entries for use by transform feedback. + * + * Note: since we don't currently pack varyings, it is currently impossible + * for the client to actually use up all of these binding table entries--if + * all of their varyings were floats, they would run out of varying slots and + * fail to link. 
But that's a bug, so it seems prudent to go ahead and + * allocate the number of binding table entries we will need once the bug is + * fixed. + */ +#define BRW_MAX_SOL_BINDINGS 64 + +/** The program key for Vertex Shaders. */ +struct brw_vs_prog_key { + struct brw_base_prog_key base; + + /** + * Per-attribute workaround flags + * + * For each attribute, a combination of BRW_ATTRIB_WA_*. + * + * For OpenGL, where we expose a maximum of 16 user input attributes + * we only need up to VERT_ATTRIB_MAX slots, however, in Vulkan + * slots preceding VERT_ATTRIB_GENERIC0 are unused and we can + * expose up to 28 user input vertex attributes that are mapped to slots + * starting at VERT_ATTRIB_GENERIC0, so this array needs to be large + * enough to hold this many slots. + */ + uint8_t gl_attrib_wa_flags[MAX2(MAX_GL_VERT_ATTRIB, MAX_VK_VERT_ATTRIB)]; + + /** + * For pre-Gfx6 hardware, a bitfield indicating which texture coordinates + * are going to be replaced with point coordinates (as a consequence of a + * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because + * our SF thread requires exact matching between VS outputs and FS inputs, + * these texture coordinates will need to be unconditionally included in + * the VUE, even if they aren't written by the vertex shader. + */ + uint8_t point_coord_replace; + unsigned clamp_pointsize:1; + + bool copy_edgeflag:1; + + bool clamp_vertex_color:1; + + /** + * How many user clipping planes are being uploaded to the vertex shader as + * push constants. + * + * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to + * clip distances. + */ + unsigned nr_userclip_plane_consts:4; + + uint32_t padding: 25; +}; + +/** The program key for Tessellation Control Shaders. */ +struct brw_tcs_prog_key +{ + struct brw_base_prog_key base; + + /** A bitfield of per-vertex outputs written. 
*/ + uint64_t outputs_written; + + enum tess_primitive_mode _tes_primitive_mode; + + /** Number of input vertices, 0 means dynamic */ + unsigned input_vertices; + + /** A bitfield of per-patch outputs written. */ + uint32_t patch_outputs_written; + + bool quads_workaround; + uint32_t padding:24; +}; + +#define BRW_MAX_TCS_INPUT_VERTICES (32) + +static inline uint32_t +brw_tcs_prog_key_input_vertices(const struct brw_tcs_prog_key *key) +{ + return key->input_vertices != 0 ? + key->input_vertices : BRW_MAX_TCS_INPUT_VERTICES; +} + +/** The program key for Tessellation Evaluation Shaders. */ +struct brw_tes_prog_key +{ + struct brw_base_prog_key base; + + /** A bitfield of per-vertex inputs read. */ + uint64_t inputs_read; + + /** A bitfield of per-patch inputs read. */ + uint32_t patch_inputs_read; + + /** + * How many user clipping planes are being uploaded to the tessellation + * evaluation shader as push constants. + * + * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to + * clip distances. + */ + unsigned nr_userclip_plane_consts:4; + unsigned clamp_pointsize:1; + uint32_t padding:27; +}; + +/** The program key for Geometry Shaders. */ +struct brw_gs_prog_key +{ + struct brw_base_prog_key base; + + /** + * How many user clipping planes are being uploaded to the geometry shader + * as push constants. + * + * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to + * clip distances. 
+ */ + unsigned nr_userclip_plane_consts:4; + unsigned clamp_pointsize:1; + unsigned padding:27; +}; + +struct brw_task_prog_key +{ + struct brw_base_prog_key base; +}; + +struct brw_mesh_prog_key +{ + struct brw_base_prog_key base; + + bool compact_mue:1; + unsigned padding:31; +}; + +enum brw_sf_primitive { + BRW_SF_PRIM_POINTS = 0, + BRW_SF_PRIM_LINES = 1, + BRW_SF_PRIM_TRIANGLES = 2, + BRW_SF_PRIM_UNFILLED_TRIS = 3, +}; + +struct brw_sf_prog_key { + uint64_t attrs; + bool contains_flat_varying; + unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ + uint8_t point_sprite_coord_replace; + enum brw_sf_primitive primitive:2; + bool do_twoside_color:1; + bool frontface_ccw:1; + bool do_point_sprite:1; + bool do_point_coord:1; + bool sprite_origin_lower_left:1; + bool userclip_active:1; + unsigned padding: 32; +}; + +enum brw_clip_mode { + BRW_CLIP_MODE_NORMAL = 0, + BRW_CLIP_MODE_CLIP_ALL = 1, + BRW_CLIP_MODE_CLIP_NON_REJECTED = 2, + BRW_CLIP_MODE_REJECT_ALL = 3, + BRW_CLIP_MODE_ACCEPT_ALL = 4, + BRW_CLIP_MODE_KERNEL_CLIP = 5, +}; + +enum brw_clip_fill_mode { + BRW_CLIP_FILL_MODE_LINE = 0, + BRW_CLIP_FILL_MODE_POINT = 1, + BRW_CLIP_FILL_MODE_FILL = 2, + BRW_CLIP_FILL_MODE_CULL = 3, +}; + +/* Note that if unfilled primitives are being emitted, we have to fix + * up polygon offset and flatshading at this point: + */ +struct brw_clip_prog_key { + uint64_t attrs; + float offset_factor; + float offset_units; + float offset_clamp; + bool contains_flat_varying; + bool contains_noperspective_varying; + unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ + unsigned primitive:4; + unsigned nr_userclip:4; + bool pv_first:1; + bool do_unfilled:1; + enum brw_clip_fill_mode fill_cw:2; /* includes cull information */ + enum brw_clip_fill_mode fill_ccw:2; /* includes cull information */ + bool offset_cw:1; + bool offset_ccw:1; + bool copy_bfc_cw:1; + bool copy_bfc_ccw:1; + enum brw_clip_mode clip_mode:3; + uint64_t padding:51; +}; + +/* A big lookup table is used to 
figure out which and how many + * additional regs will inserted before the main payload in the WM + * program execution. These mainly relate to depth and stencil + * processing and the early-depth-test optimization. + */ +enum brw_wm_iz_bits { + BRW_WM_IZ_PS_KILL_ALPHATEST_BIT = 0x1, + BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT = 0x2, + BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT = 0x4, + BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT = 0x8, + BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT = 0x10, + BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT = 0x20, + BRW_WM_IZ_BIT_MAX = 0x40 +}; + +enum brw_sometimes { + BRW_NEVER = 0, + BRW_SOMETIMES, + BRW_ALWAYS +}; + +static inline enum brw_sometimes +brw_sometimes_invert(enum brw_sometimes x) +{ + return (enum brw_sometimes)((int)BRW_ALWAYS - (int)x); +} + +/** The program key for Fragment/Pixel Shaders. */ +struct brw_wm_prog_key { + struct brw_base_prog_key base; + + uint64_t input_slots_valid; + float alpha_test_ref; + uint8_t color_outputs_valid; + + /* Some collection of BRW_WM_IZ_* */ + uint8_t iz_lookup; + bool stats_wm:1; + bool flat_shade:1; + unsigned nr_color_regions:5; + bool emit_alpha_test:1; + enum compare_func alpha_test_func:3; /* < For Gfx4/5 MRT alpha test */ + bool alpha_test_replicate_alpha:1; + enum brw_sometimes alpha_to_coverage:2; + bool clamp_fragment_color:1; + + bool force_dual_color_blend:1; + + /** Whether or inputs are interpolated at sample rate by default + * + * This corresponds to the sample shading API bit in Vulkan or OpenGL which + * controls how inputs with no interpolation qualifier are interpolated. + * This is distinct from the way that using gl_SampleID or similar requires + * us to run per-sample. Even when running per-sample due to gl_SampleID, + * we may still interpolate unqualified inputs at the pixel center. 
+ */ + enum brw_sometimes persample_interp:2; + + /* Whether or not we are running on a multisampled framebuffer */ + enum brw_sometimes multisample_fbo:2; + + enum brw_sometimes line_aa:2; + + /* Whether the preceding shader stage is mesh */ + enum brw_sometimes mesh_input:2; + + bool coherent_fb_fetch:1; + bool ignore_sample_mask_out:1; + bool coarse_pixel:1; + + uint64_t padding:53; +}; + +struct brw_cs_prog_key { + struct brw_base_prog_key base; +}; + +struct brw_bs_prog_key { + struct brw_base_prog_key base; + + /* Represents enum brw_rt_ray_flags values given at pipeline creation + * to be combined with ray_flags handed to the traceRayEXT() calls by the + * shader. + */ + uint32_t pipeline_ray_flags; +}; + +struct brw_ff_gs_prog_key { + uint64_t attrs; + + /** + * Map from the index of a transform feedback binding table entry to the + * gl_varying_slot that should be streamed out through that binding table + * entry. + */ + unsigned char transform_feedback_bindings[BRW_MAX_SOL_BINDINGS]; + + /** + * Map from the index of a transform feedback binding table entry to the + * swizzles that should be used when streaming out data through that + * binding table entry. + */ + unsigned char transform_feedback_swizzles[BRW_MAX_SOL_BINDINGS]; + + /** + * Hardware primitive type being drawn, e.g. _3DPRIM_TRILIST. + */ + unsigned primitive:8; + + unsigned pv_first:1; + unsigned need_gs_prog:1; + + /** + * Number of varyings that are output to transform feedback. 
+ */ + unsigned num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ + uint64_t padding:47; +}; + +/* brw_any_prog_key is any of the keys that map to an API stage */ +union brw_any_prog_key { + struct brw_base_prog_key base; + struct brw_vs_prog_key vs; + struct brw_tcs_prog_key tcs; + struct brw_tes_prog_key tes; + struct brw_gs_prog_key gs; + struct brw_wm_prog_key wm; + struct brw_cs_prog_key cs; + struct brw_bs_prog_key bs; + struct brw_task_prog_key task; + struct brw_mesh_prog_key mesh; +}; + +PRAGMA_DIAGNOSTIC_POP + +/** Max number of render targets in a shader */ +#define BRW_MAX_DRAW_BUFFERS 8 + +/** + * Binding table index for the first gfx6 SOL binding. + */ +#define BRW_GFX6_SOL_BINDING_START 0 + +struct brw_ubo_range +{ + uint16_t block; + + /* In units of 32-byte registers */ + uint8_t start; + uint8_t length; +}; + +/* We reserve the first 2^16 values for builtins */ +#define BRW_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0) + +enum brw_param_builtin { + BRW_PARAM_BUILTIN_ZERO, + + BRW_PARAM_BUILTIN_CLIP_PLANE_0_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_0_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_0_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_0_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_1_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_1_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_1_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_1_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_2_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_2_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_2_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_3_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_3_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_3_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_3_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_4_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_4_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_4_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_4_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_5_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_5_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_5_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_5_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_6_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_6_Y, + 
BRW_PARAM_BUILTIN_CLIP_PLANE_6_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_6_W, + BRW_PARAM_BUILTIN_CLIP_PLANE_7_X, + BRW_PARAM_BUILTIN_CLIP_PLANE_7_Y, + BRW_PARAM_BUILTIN_CLIP_PLANE_7_Z, + BRW_PARAM_BUILTIN_CLIP_PLANE_7_W, + + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X, + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y, + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z, + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W, + BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X, + BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y, + + BRW_PARAM_BUILTIN_PATCH_VERTICES_IN, + + BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X, + BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y, + BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z, + BRW_PARAM_BUILTIN_SUBGROUP_ID, + BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X, + BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y, + BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z, + BRW_PARAM_BUILTIN_WORK_DIM, +}; + +#define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \ + (BRW_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp)) + +#define BRW_PARAM_BUILTIN_IS_CLIP_PLANE(param) \ + ((param) >= BRW_PARAM_BUILTIN_CLIP_PLANE_0_X && \ + (param) <= BRW_PARAM_BUILTIN_CLIP_PLANE_7_W) + +#define BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \ + (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2) + +#define BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \ + (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3) + +enum brw_shader_reloc_id { + BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW, + BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH, + BRW_SHADER_RELOC_SHADER_START_OFFSET, + BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW, + BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH, + BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH, +}; + +enum brw_shader_reloc_type { + /** An arbitrary 32-bit value */ + BRW_SHADER_RELOC_TYPE_U32, + /** A MOV instruction with an immediate source */ + BRW_SHADER_RELOC_TYPE_MOV_IMM, +}; + +/** Represents a code relocation + * + * Relocatable constants are immediates in the code which we want to be able + * to replace post-compile with the actual value. 
+ */ +struct brw_shader_reloc { + /** The 32-bit ID of the relocatable constant */ + uint32_t id; + + /** Type of this relocation */ + enum brw_shader_reloc_type type; + + /** The offset in the shader to the relocated value + * + * For MOV_IMM relocs, this is an offset to the MOV instruction. This + * allows us to do some sanity checking while we update the value. + */ + uint32_t offset; + + /** Value to be added to the relocated value before it is written */ + uint32_t delta; +}; + +/** A value to write to a relocation */ +struct brw_shader_reloc_value { + /** The 32-bit ID of the relocatable constant */ + uint32_t id; + + /** The value with which to replace the relocated immediate */ + uint32_t value; +}; + +struct brw_stage_prog_data { + struct brw_ubo_range ubo_ranges[4]; + + unsigned nr_params; /**< number of float params/constants */ + + gl_shader_stage stage; + + /* zero_push_reg is a bitfield which indicates what push registers (if any) + * should be zeroed by SW at the start of the shader. The corresponding + * push_reg_mask_param specifies the param index (in 32-bit units) where + * the actual runtime 64-bit mask will be pushed. The shader will zero + * push reg i if + * + * reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i) + * + * If this field is set, brw_compiler::compact_params must be false. + */ + uint64_t zero_push_reg; + unsigned push_reg_mask_param; + + unsigned curb_read_length; + unsigned total_scratch; + unsigned total_shared; + + unsigned program_size; + + unsigned const_data_size; + unsigned const_data_offset; + + unsigned num_relocs; + const struct brw_shader_reloc *relocs; + + /** Does this program pull from any UBO or other constant buffers? */ + bool has_ubo_pull; + + /** How many ray queries objects in this shader. */ + unsigned ray_queries; + + /** + * Register where the thread expects to find input data from the URB + * (typically uniforms, followed by vertex or fragment attributes). 
+ */ + unsigned dispatch_grf_start_reg; + + bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */ + + /* 32-bit identifiers for all push/pull parameters. These can be anything + * the driver wishes them to be; the core of the back-end compiler simply + * re-arranges them. The one restriction is that the bottom 2^16 values + * are reserved for builtins defined in the brw_param_builtin enum defined + * above. + */ + uint32_t *param; + + /* Whether shader uses atomic operations. */ + bool uses_atomic_load_store; +}; + +static inline uint32_t * +brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data, + unsigned nr_new_params) +{ + unsigned old_nr_params = prog_data->nr_params; + prog_data->nr_params += nr_new_params; + prog_data->param = reralloc(ralloc_parent(prog_data->param), + prog_data->param, uint32_t, + prog_data->nr_params); + return prog_data->param + old_nr_params; +} + +enum brw_barycentric_mode { + BRW_BARYCENTRIC_PERSPECTIVE_PIXEL = 0, + BRW_BARYCENTRIC_PERSPECTIVE_CENTROID = 1, + BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE = 2, + BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL = 3, + BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4, + BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE = 5, + BRW_BARYCENTRIC_MODE_COUNT = 6 +}; +#define BRW_BARYCENTRIC_PERSPECTIVE_BITS \ + ((1 << BRW_BARYCENTRIC_PERSPECTIVE_PIXEL) | \ + (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID) | \ + (1 << BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE)) +#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \ + ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \ + (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \ + (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)) + +enum brw_pixel_shader_computed_depth_mode { + BRW_PSCDEPTH_OFF = 0, /* PS does not compute depth */ + BRW_PSCDEPTH_ON = 1, /* PS computes depth; no guarantee about value */ + BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */ + BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */ +}; + +/* Data about a particular 
attempt to compile a program. Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs. + */ +struct brw_wm_prog_data { + struct brw_stage_prog_data base; + + unsigned num_per_primitive_inputs; + unsigned num_varying_inputs; + + uint8_t reg_blocks_8; + uint8_t reg_blocks_16; + uint8_t reg_blocks_32; + + uint8_t dispatch_grf_start_reg_16; + uint8_t dispatch_grf_start_reg_32; + uint32_t prog_offset_16; + uint32_t prog_offset_32; + + struct { + /** @{ + * surface indices the WM-specific surfaces + */ + uint32_t render_target_read_start; + /** @} */ + } binding_table; + + uint8_t color_outputs_written; + uint8_t computed_depth_mode; + + /** + * Number of polygons handled in parallel by the multi-polygon PS + * kernel. + */ + uint8_t max_polygons; + + /** + * Dispatch width of the multi-polygon PS kernel, or 0 if no + * multi-polygon kernel was built. + */ + uint8_t dispatch_multi; + + bool computed_stencil; + bool early_fragment_tests; + bool post_depth_coverage; + bool inner_coverage; + bool dispatch_8; + bool dispatch_16; + bool dispatch_32; + bool dual_src_blend; + bool uses_pos_offset; + bool uses_omask; + bool uses_kill; + bool uses_src_depth; + bool uses_src_w; + bool uses_depth_w_coefficients; + bool uses_sample_mask; + bool uses_vmask; + bool has_render_target_reads; + bool has_side_effects; + bool pulls_bary; + + bool contains_flat_varying; + bool contains_noperspective_varying; + + /** True if the shader wants sample shading + * + * This corresponds to whether or not a gl_SampleId, gl_SamplePosition, or + * a sample-qualified input are used in the shader. It is independent of + * GL_MIN_SAMPLE_SHADING_VALUE in GL or minSampleShading in Vulkan. + */ + bool sample_shading; + + /** Should this shader be dispatched per-sample */ + enum brw_sometimes persample_dispatch; + + /** + * Shader is ran at the coarse pixel shading dispatch rate (3DSTATE_CPS). 
+ */ + enum brw_sometimes coarse_pixel_dispatch; + + /** + * Shader writes the SampleMask and this is AND-ed with the API's + * SampleMask to generate a new coverage mask. + */ + enum brw_sometimes alpha_to_coverage; + + unsigned msaa_flags_param; + + /** + * Mask of which interpolation modes are required by the fragment shader. + * Those interpolations are delivered as part of the thread payload. Used + * in hardware setup on gfx6+. + */ + uint32_t barycentric_interp_modes; + + /** + * Whether nonperspective interpolation modes are used by the + * barycentric_interp_modes or fragment shader through interpolator messages. + */ + bool uses_nonperspective_interp_modes; + + /** + * Mask of which FS inputs are marked flat by the shader source. This is + * needed for setting up 3DSTATE_SF/SBE. + */ + uint32_t flat_inputs; + + /** + * The FS inputs + */ + uint64_t inputs; + + /* Mapping of VUE slots to interpolation modes. + * Used by the Gfx4-5 clip/sf/wm stages. + */ + unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ + + /** + * Map from gl_varying_slot to the position within the FS setup data + * payload where the varying's attribute vertex deltas should be delivered. + * For varying slots that are not used by the FS, the value is -1. + */ + int urb_setup[VARYING_SLOT_MAX]; + int urb_setup_channel[VARYING_SLOT_MAX]; + + /** + * Cache structure into the urb_setup array above that contains the + * attribute numbers of active varyings out of urb_setup. + * The actual count is stored in urb_setup_attribs_count. + */ + uint8_t urb_setup_attribs[VARYING_SLOT_MAX]; + uint8_t urb_setup_attribs_count; +}; + +#ifdef GFX_VERx10 + +#if GFX_VERx10 >= 200 + +/** Returns the SIMD width corresponding to a given KSP index + * + * The "Variable Pixel Dispatch" table in the PRM (which can be found, for + * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to + * kernel start pointer (KSP) indices that is based on what dispatch widths + * are enabled. 
This function provides, effectively, the reverse mapping. + * + * If the given KSP is enabled, a SIMD width of 8, 16, or 32 is + * returned. Note that for a multipolygon dispatch kernel 8 is always + * returned, since multipolygon kernels use the "_8" fields from + * brw_wm_prog_data regardless of their SIMD width. If the KSP is + * invalid, 0 is returned. + */ +static inline unsigned +brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool enabled, unsigned width_sel) +{ + assert(ksp_idx < 2); + return !enabled ? 0 : + width_sel ? 32 : + 16; +} + +#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \ + (ksp_idx == 0 && (wm_state).Kernel0MaximumPolysperThread ? 8 : \ + ksp_idx == 0 ? brw_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel0Enable, \ + (wm_state).Kernel0SIMDWidth): \ + brw_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel1Enable, \ + (wm_state).Kernel1SIMDWidth)) + +#else + +/** Returns the SIMD width corresponding to a given KSP index + * + * The "Variable Pixel Dispatch" table in the PRM (which can be found, for + * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to + * kernel start pointer (KSP) indices that is based on what dispatch widths + * are enabled. This function provides, effectively, the reverse mapping. + * + * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD + * width of 8, 16, or 32 is returned. If the KSP is invalid, 0 is returned. + */ +static inline unsigned +brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled, + bool simd16_enabled, bool simd32_enabled) +{ + /* This function strictly ignores contiguous dispatch */ + switch (ksp_idx) { + case 0: + return simd8_enabled ? 8 : + (simd16_enabled && !simd32_enabled) ? 16 : + (simd32_enabled && !simd16_enabled) ? 32 : 0; + case 1: + return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0; + case 2: + return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 
16 : 0; + default: + unreachable("Invalid KSP index"); + } +} + +#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \ + brw_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \ + (wm_state)._16PixelDispatchEnable, \ + (wm_state)._32PixelDispatchEnable) + +#endif + +#endif + +#define brw_wm_state_has_ksp(wm_state, ksp_idx) \ + (brw_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0) + +static inline uint32_t +_brw_wm_prog_data_prog_offset(const struct brw_wm_prog_data *prog_data, + unsigned simd_width) +{ + switch (simd_width) { + case 8: return 0; + case 16: return prog_data->prog_offset_16; + case 32: return prog_data->prog_offset_32; + default: return 0; + } +} + +#define brw_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \ + _brw_wm_prog_data_prog_offset(prog_data, \ + brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)) + +static inline uint8_t +_brw_wm_prog_data_dispatch_grf_start_reg(const struct brw_wm_prog_data *prog_data, + unsigned simd_width) +{ + switch (simd_width) { + case 8: return prog_data->base.dispatch_grf_start_reg; + case 16: return prog_data->dispatch_grf_start_reg_16; + case 32: return prog_data->dispatch_grf_start_reg_32; + default: return 0; + } +} + +#define brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \ + _brw_wm_prog_data_dispatch_grf_start_reg(prog_data, \ + brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)) + +static inline uint8_t +_brw_wm_prog_data_reg_blocks(const struct brw_wm_prog_data *prog_data, + unsigned simd_width) +{ + switch (simd_width) { + case 8: return prog_data->reg_blocks_8; + case 16: return prog_data->reg_blocks_16; + case 32: return prog_data->reg_blocks_32; + default: return 0; + } +} + +#define brw_wm_prog_data_reg_blocks(prog_data, wm_state, ksp_idx) \ + _brw_wm_prog_data_reg_blocks(prog_data, \ + brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)) + +static inline bool +brw_wm_prog_data_is_persample(const struct brw_wm_prog_data *prog_data, + enum 
intel_msaa_flags pushed_msaa_flags) +{ + if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) { + if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_MULTISAMPLE_FBO)) + return false; + + if (prog_data->sample_shading) + assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH); + + if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) + assert(prog_data->persample_dispatch != BRW_NEVER); + else + assert(prog_data->persample_dispatch != BRW_ALWAYS); + + return (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0; + } + + assert(prog_data->persample_dispatch == BRW_ALWAYS || + prog_data->persample_dispatch == BRW_NEVER); + + return prog_data->persample_dispatch; +} + +static inline uint32_t +wm_prog_data_barycentric_modes(const struct brw_wm_prog_data *prog_data, + enum intel_msaa_flags pushed_msaa_flags) +{ + uint32_t modes = prog_data->barycentric_interp_modes; + + /* In the non dynamic case, we can just return the computed modes from + * compilation time. + */ + if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC)) + return modes; + + if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_INTERP) { + assert(prog_data->persample_dispatch == BRW_ALWAYS || + (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)); + + /* Making dynamic per-sample interpolation work is a bit tricky. The + * hardware will hang if SAMPLE is requested but per-sample dispatch is + * not enabled. This means we can't preemptively add SAMPLE to the + * barycentrics bitfield. Instead, we have to add it late and only + * on-demand. Annoyingly, changing the number of barycentrics requested + * changes the whole PS shader payload so we very much don't want to do + * that. Instead, if the dynamic per-sample interpolation flag is set, + * we check to see if SAMPLE was requested and, if not, replace the + * highest barycentric bit in the [non]perspective grouping (CENTROID, + * if it exists, else PIXEL) with SAMPLE. 
The shader will stomp all the + * barycentrics in the shader with SAMPLE so it really doesn't matter + * which one we replace. The important thing is that we keep the number + * of barycentrics in each [non]perspective grouping the same. + */ + if ((modes & BRW_BARYCENTRIC_PERSPECTIVE_BITS) && + !(modes & BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE))) { + int sample_mode = + util_last_bit(modes & BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1; + assert(modes & BITFIELD_BIT(sample_mode)); + + modes &= ~BITFIELD_BIT(sample_mode); + modes |= BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE); + } + + if ((modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) && + !(modes & BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) { + int sample_mode = + util_last_bit(modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1; + assert(modes & BITFIELD_BIT(sample_mode)); + + modes &= ~BITFIELD_BIT(sample_mode); + modes |= BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE); + } + } else { + /* If we're not using per-sample interpolation, we need to disable the + * per-sample bits. + * + * SKL PRMs, Volume 2a: Command Reference: Instructions, + * 3DSTATE_WM:Barycentric Interpolation Mode: + + * "MSDISPMODE_PERSAMPLE is required in order to select Perspective + * Sample or Non-perspective Sample barycentric coordinates." 
+ */ + modes &= ~(BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE) | + BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)); + } + + return modes; +} + +static inline bool +brw_wm_prog_data_is_coarse(const struct brw_wm_prog_data *prog_data, + enum intel_msaa_flags pushed_msaa_flags) +{ + if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) { + if (pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES) + assert(prog_data->coarse_pixel_dispatch != BRW_NEVER); + else + assert(prog_data->coarse_pixel_dispatch != BRW_ALWAYS); + + return pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES; + } + + assert(prog_data->coarse_pixel_dispatch == BRW_ALWAYS || + prog_data->coarse_pixel_dispatch == BRW_NEVER); + + return prog_data->coarse_pixel_dispatch; +} + +struct brw_push_const_block { + unsigned dwords; /* Dword count, not reg aligned */ + unsigned regs; + unsigned size; /* Bytes, register aligned */ +}; + +struct brw_cs_prog_data { + struct brw_stage_prog_data base; + + unsigned local_size[3]; + + /* Program offsets for the 8/16/32 SIMD variants. Multiple variants are + * kept when using variable group size, and the right one can only be + * decided at dispatch time. + */ + unsigned prog_offset[3]; + + /* Bitmask indicating which program offsets are valid. */ + unsigned prog_mask; + + /* Bitmask indicating which programs have spilled. 
*/ + unsigned prog_spilled; + + bool uses_barrier; + bool uses_num_work_groups; + bool uses_inline_data; + bool uses_btd_stack_ids; + bool uses_systolic; + uint8_t generate_local_id; + enum intel_compute_walk_order walk_order; + + struct { + struct brw_push_const_block cross_thread; + struct brw_push_const_block per_thread; + } push; + + struct { + /** @{ + * surface indices the CS-specific surfaces + */ + uint32_t work_groups_start; + /** @} */ + } binding_table; +}; + +static inline uint32_t +brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data, + unsigned dispatch_width) +{ + assert(dispatch_width == 8 || + dispatch_width == 16 || + dispatch_width == 32); + const unsigned index = dispatch_width / 16; + assert(prog_data->prog_mask & (1 << index)); + return prog_data->prog_offset[index]; +} + +struct brw_bs_prog_data { + struct brw_stage_prog_data base; + + /** SIMD size of the root shader */ + uint8_t simd_size; + + /** Maximum stack size of all shaders */ + uint32_t max_stack_size; + + /** Offset into the shader where the resume SBT is located */ + uint32_t resume_sbt_offset; + + /** Number of resume shaders */ + uint32_t num_resume_shaders; +}; + +struct brw_ff_gs_prog_data { + unsigned urb_read_length; + unsigned total_grf; + + /** + * Gfx6 transform feedback: Amount by which the streaming vertex buffer + * indices should be incremented each time the GS is invoked. + */ + unsigned svbi_postincrement_value; +}; + +/** + * Enum representing the i965-specific vertex results that don't correspond + * exactly to any element of gl_varying_slot. The values of this enum are + * assigned such that they don't conflict with gl_varying_slot. + */ +typedef enum +{ + BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX, + BRW_VARYING_SLOT_PAD, + /** + * Technically this is not a varying but just a placeholder that + * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord + * builtin variable to be compiled correctly. 
see compile_sf_prog() for + * more info. + */ + BRW_VARYING_SLOT_PNTC, + BRW_VARYING_SLOT_COUNT +} brw_varying_slot; + +/** + * We always program SF to start reading at an offset of 1 (2 varying slots) + * from the start of the vertex URB entry. This causes it to skip: + * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gfx4-5 + * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gfx6+ + */ +#define BRW_SF_URB_ENTRY_READ_OFFSET 1 + +/** + * Bitmask indicating which fragment shader inputs represent varyings (and + * hence have to be delivered to the fragment shader by the SF/SBE stage). + */ +#define BRW_FS_VARYING_INPUT_MASK \ + (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \ + ~VARYING_BIT_POS & ~VARYING_BIT_FACE) + +void brw_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map, + gl_shader_stage stage); + +/** + * Convert a VUE slot number into a byte offset within the VUE. + */ +static inline unsigned brw_vue_slot_to_offset(unsigned slot) +{ + return 16*slot; +} + +/** + * Convert a vertex output (brw_varying_slot) into a byte offset within the + * VUE. + */ +static inline unsigned +brw_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying) +{ + return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); +} + +void brw_compute_vue_map(const struct intel_device_info *devinfo, + struct intel_vue_map *vue_map, + uint64_t slots_valid, + bool separate_shader, + uint32_t pos_slots); + +void brw_compute_tess_vue_map(struct intel_vue_map *const vue_map, + uint64_t slots_valid, + uint32_t is_patch); + +/* brw_interpolation_map.c */ +void brw_setup_vue_interpolation(const struct intel_vue_map *vue_map, + struct nir_shader *nir, + struct brw_wm_prog_data *prog_data); + +struct brw_vue_prog_data { + struct brw_stage_prog_data base; + struct intel_vue_map vue_map; + + /** Should the hardware deliver input VUE handles for URB pull loads? 
*/ + bool include_vue_handles; + + unsigned urb_read_length; + unsigned total_grf; + + uint32_t clip_distance_mask; + uint32_t cull_distance_mask; + + /* Used for calculating urb partitions. In the VS, this is the size of the + * URB entry used for both input and output to the thread. In the GS, this + * is the size of the URB entry used for output. + */ + unsigned urb_entry_size; + + enum intel_shader_dispatch_mode dispatch_mode; +}; + +struct brw_vs_prog_data { + struct brw_vue_prog_data base; + + uint64_t inputs_read; + uint64_t double_inputs_read; + + unsigned nr_attribute_slots; + + bool uses_vertexid; + bool uses_instanceid; + bool uses_is_indexed_draw; + bool uses_firstvertex; + bool uses_baseinstance; + bool uses_drawid; +}; + +struct brw_tcs_prog_data +{ + struct brw_vue_prog_data base; + + /** Should the non-SINGLE_PATCH payload provide primitive ID? */ + bool include_primitive_id; + + /** Number vertices in output patch */ + int instances; + + /** Track patch count threshold */ + int patch_count_threshold; +}; + + +struct brw_tes_prog_data +{ + struct brw_vue_prog_data base; + + enum intel_tess_partitioning partitioning; + enum intel_tess_output_topology output_topology; + enum intel_tess_domain domain; + bool include_primitive_id; +}; + +struct brw_gs_prog_data +{ + struct brw_vue_prog_data base; + + unsigned vertices_in; + + /** + * Size of an output vertex, measured in HWORDS (32 bytes). + */ + unsigned output_vertex_size_hwords; + + unsigned output_topology; + + /** + * Size of the control data (cut bits or StreamID bits), in hwords (32 + * bytes). 0 if there is no control data. + */ + unsigned control_data_header_size_hwords; + + /** + * Format of the control data (either GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID + * if the control data is StreamID bits, or + * GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). + * Ignored if control_data_header_size is 0. 
+ */ + unsigned control_data_format; + + bool include_primitive_id; + + /** + * The number of vertices emitted, if constant - otherwise -1. + */ + int static_vertex_count; + + int invocations; + + /** + * Gfx6: Provoking vertex convention for odd-numbered triangles + * in tristrips. + */ + unsigned pv_first:1; + + /** + * Gfx6: Number of varyings that are output to transform feedback. + */ + unsigned num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ + + /** + * Gfx6: Map from the index of a transform feedback binding table entry to the + * gl_varying_slot that should be streamed out through that binding table + * entry. + */ + unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */]; + + /** + * Gfx6: Map from the index of a transform feedback binding table entry to the + * swizzles that should be used when streaming out data through that + * binding table entry. + */ + unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */]; +}; + +struct brw_sf_prog_data { + uint32_t urb_read_length; + uint32_t total_grf; + + /* Each vertex may have up to 12 attributes, 4 components each, + * except WPOS which requires only 2. (11*4 + 2) == 44 ==> 11 + * rows. + * + * Actually we use 4 for each, so call it 12 rows. + */ + unsigned urb_entry_size; +}; + +struct brw_clip_prog_data { + uint32_t curb_read_length; /* user planes? 
*/ + uint32_t clip_mode; + uint32_t urb_read_length; + uint32_t total_grf; +}; + +struct brw_tue_map { + uint32_t size_dw; + + uint32_t per_task_data_start_dw; +}; + +struct brw_mue_map { + int32_t start_dw[VARYING_SLOT_MAX]; + uint32_t len_dw[VARYING_SLOT_MAX]; + uint32_t per_primitive_indices_dw; + + uint32_t size_dw; + + uint32_t max_primitives; + uint32_t per_primitive_start_dw; + uint32_t per_primitive_header_size_dw; + uint32_t per_primitive_data_size_dw; + uint32_t per_primitive_pitch_dw; + bool user_data_in_primitive_header; + + uint32_t max_vertices; + uint32_t per_vertex_start_dw; + uint32_t per_vertex_header_size_dw; + uint32_t per_vertex_data_size_dw; + uint32_t per_vertex_pitch_dw; + bool user_data_in_vertex_header; +}; + +struct brw_task_prog_data { + struct brw_cs_prog_data base; + struct brw_tue_map map; + bool uses_drawid; +}; + +enum brw_mesh_index_format { + BRW_INDEX_FORMAT_U32, + BRW_INDEX_FORMAT_U888X, +}; + +struct brw_mesh_prog_data { + struct brw_cs_prog_data base; + struct brw_mue_map map; + + uint32_t clip_distance_mask; + uint32_t cull_distance_mask; + uint16_t primitive_type; + + enum brw_mesh_index_format index_format; + + bool uses_drawid; +}; + +/* brw_any_prog_data is prog_data for any stage that maps to an API stage */ +union brw_any_prog_data { + struct brw_stage_prog_data base; + struct brw_vue_prog_data vue; + struct brw_vs_prog_data vs; + struct brw_tcs_prog_data tcs; + struct brw_tes_prog_data tes; + struct brw_gs_prog_data gs; + struct brw_wm_prog_data wm; + struct brw_cs_prog_data cs; + struct brw_bs_prog_data bs; + struct brw_task_prog_data task; + struct brw_mesh_prog_data mesh; +}; + +#define DEFINE_PROG_DATA_DOWNCAST(STAGE, CHECK) \ +static inline struct brw_##STAGE##_prog_data * \ +brw_##STAGE##_prog_data(struct brw_stage_prog_data *prog_data) \ +{ \ + if (prog_data) \ + assert(CHECK); \ + return (struct brw_##STAGE##_prog_data *) prog_data; \ +} \ +static inline const struct brw_##STAGE##_prog_data * \ 
+brw_##STAGE##_prog_data_const(const struct brw_stage_prog_data *prog_data) \ +{ \ + if (prog_data) \ + assert(CHECK); \ + return (const struct brw_##STAGE##_prog_data *) prog_data; \ +} + +DEFINE_PROG_DATA_DOWNCAST(vs, prog_data->stage == MESA_SHADER_VERTEX) +DEFINE_PROG_DATA_DOWNCAST(tcs, prog_data->stage == MESA_SHADER_TESS_CTRL) +DEFINE_PROG_DATA_DOWNCAST(tes, prog_data->stage == MESA_SHADER_TESS_EVAL) +DEFINE_PROG_DATA_DOWNCAST(gs, prog_data->stage == MESA_SHADER_GEOMETRY) +DEFINE_PROG_DATA_DOWNCAST(wm, prog_data->stage == MESA_SHADER_FRAGMENT) +DEFINE_PROG_DATA_DOWNCAST(cs, gl_shader_stage_uses_workgroup(prog_data->stage)) +DEFINE_PROG_DATA_DOWNCAST(bs, brw_shader_stage_is_bindless(prog_data->stage)) + +DEFINE_PROG_DATA_DOWNCAST(vue, prog_data->stage == MESA_SHADER_VERTEX || + prog_data->stage == MESA_SHADER_TESS_CTRL || + prog_data->stage == MESA_SHADER_TESS_EVAL || + prog_data->stage == MESA_SHADER_GEOMETRY) + +DEFINE_PROG_DATA_DOWNCAST(task, prog_data->stage == MESA_SHADER_TASK) +DEFINE_PROG_DATA_DOWNCAST(mesh, prog_data->stage == MESA_SHADER_MESH) + +/* These are not really brw_stage_prog_data. */ +DEFINE_PROG_DATA_DOWNCAST(ff_gs, true) +DEFINE_PROG_DATA_DOWNCAST(clip, true) +DEFINE_PROG_DATA_DOWNCAST(sf, true) +#undef DEFINE_PROG_DATA_DOWNCAST + +struct brw_compile_stats { + uint32_t dispatch_width; /**< 0 for vec4 */ + uint32_t max_polygons; + uint32_t max_dispatch_width; + uint32_t instructions; + uint32_t sends; + uint32_t loops; + uint32_t cycles; + uint32_t spills; + uint32_t fills; + uint32_t max_live_registers; +}; + +/** @} */ + +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo); + +/** + * Returns a compiler configuration for use with disk shader cache + * + * This value only needs to change for settings that can cause different + * program generation between two runs on the same hardware. 
+ * + * For example, it doesn't need to be different for gen 8 and gen 9 hardware, + * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used. + */ +uint64_t +brw_get_compiler_config_value(const struct brw_compiler *compiler); + +/* Provides a string sha1 hash of all device information fields that could + * affect shader compilation. + */ +void +brw_device_sha1(char *hex, const struct intel_device_info *devinfo); + +/* For callers computing their own UUID or hash. Hashes all device + * information fields that could affect shader compilation into the provided + * sha1_ctx. + */ +void +brw_device_sha1_update(struct mesa_sha1 *sha1_ctx, + const struct intel_device_info *devinfo); + +unsigned +brw_prog_data_size(gl_shader_stage stage); + +unsigned +brw_prog_key_size(gl_shader_stage stage); + +struct brw_compile_params { + void *mem_ctx; + + nir_shader *nir; + + struct brw_compile_stats *stats; + + void *log_data; + + char *error_str; + + uint64_t debug_flag; + + uint32_t source_hash; +}; + +/** + * Parameters for compiling a vertex shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_vs_params { + struct brw_compile_params base; + + const struct brw_vs_prog_key *key; + struct brw_vs_prog_data *prog_data; + + bool edgeflag_is_last; /* true for gallium */ +}; + +/** + * Compile a vertex shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_vs(const struct brw_compiler *compiler, + struct brw_compile_vs_params *params); + +/** + * Parameters for compiling a tessellation control shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_tcs_params { + struct brw_compile_params base; + + const struct brw_tcs_prog_key *key; + struct brw_tcs_prog_data *prog_data; +}; + +/** + * Compile a tessellation control shader. + * + * Returns the final assembly and updates the parameters structure. 
+ */ +const unsigned * +brw_compile_tcs(const struct brw_compiler *compiler, + struct brw_compile_tcs_params *params); + +/** + * Parameters for compiling a tessellation evaluation shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_tes_params { + struct brw_compile_params base; + + const struct brw_tes_prog_key *key; + struct brw_tes_prog_data *prog_data; + const struct intel_vue_map *input_vue_map; +}; + +/** + * Compile a tessellation evaluation shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_tes(const struct brw_compiler *compiler, + struct brw_compile_tes_params *params); + +/** + * Parameters for compiling a geometry shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_gs_params { + struct brw_compile_params base; + + const struct brw_gs_prog_key *key; + struct brw_gs_prog_data *prog_data; +}; + +/** + * Compile a geometry shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_gs(const struct brw_compiler *compiler, + struct brw_compile_gs_params *params); + +/** + * Compile a strips and fans shader. + * + * This is a fixed-function shader determined entirely by the shader key and + * a VUE map. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_sf(const struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_sf_prog_key *key, + struct brw_sf_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size); + +/** + * Compile a clipper shader. + * + * This is a fixed-function shader determined entirely by the shader key and + * a VUE map. + * + * Returns the final assembly and the program's size. 
+ */ +const unsigned * +brw_compile_clip(const struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_clip_prog_key *key, + struct brw_clip_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size); + +struct brw_compile_task_params { + struct brw_compile_params base; + + const struct brw_task_prog_key *key; + struct brw_task_prog_data *prog_data; +}; + +const unsigned * +brw_compile_task(const struct brw_compiler *compiler, + struct brw_compile_task_params *params); + +struct brw_compile_mesh_params { + struct brw_compile_params base; + + const struct brw_mesh_prog_key *key; + struct brw_mesh_prog_data *prog_data; + const struct brw_tue_map *tue_map; +}; + +const unsigned * +brw_compile_mesh(const struct brw_compiler *compiler, + struct brw_compile_mesh_params *params); + +/** + * Parameters for compiling a fragment shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_fs_params { + struct brw_compile_params base; + + const struct brw_wm_prog_key *key; + struct brw_wm_prog_data *prog_data; + + const struct intel_vue_map *vue_map; + const struct brw_mue_map *mue_map; + + bool allow_spilling; + bool use_rep_send; + uint8_t max_polygons; +}; + +/** + * Compile a fragment shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_fs(const struct brw_compiler *compiler, + struct brw_compile_fs_params *params); + +/** + * Parameters for compiling a compute shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_cs_params { + struct brw_compile_params base; + + const struct brw_cs_prog_key *key; + struct brw_cs_prog_data *prog_data; +}; + +/** + * Compile a compute shader. + * + * Returns the final assembly and updates the parameters structure. 
+ */ +const unsigned * +brw_compile_cs(const struct brw_compiler *compiler, + struct brw_compile_cs_params *params); + +/** + * Parameters for compiling a Bindless shader. + * + * Some of these will be modified during the shader compilation. + */ +struct brw_compile_bs_params { + struct brw_compile_params base; + + const struct brw_bs_prog_key *key; + struct brw_bs_prog_data *prog_data; + + unsigned num_resume_shaders; + struct nir_shader **resume_shaders; +}; + +/** + * Compile a Bindless shader. + * + * Returns the final assembly and updates the parameters structure. + */ +const unsigned * +brw_compile_bs(const struct brw_compiler *compiler, + struct brw_compile_bs_params *params); + +/** + * Compile a fixed function geometry shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_ff_gs_prog(struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_ff_gs_prog_key *key, + struct brw_ff_gs_prog_data *prog_data, + struct intel_vue_map *vue_map, + unsigned *final_assembly_size); + +void brw_debug_key_recompile(const struct brw_compiler *c, void *log, + gl_shader_stage stage, + const struct brw_base_prog_key *old_key, + const struct brw_base_prog_key *key); + +/* Shared Local Memory Size is specified as powers of two, + * and also have a Gen-dependent minimum value if not zero. + */ +static inline uint32_t +intel_calculate_slm_size(unsigned gen, uint32_t bytes) +{ + assert(bytes <= 64 * 1024); + if (bytes > 0) + return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 
1024 : 4096); + else + return 0; +} + +static inline uint32_t +encode_slm_size(unsigned gen, uint32_t bytes) +{ + uint32_t slm_size = 0; + + /* Shared Local Memory is specified as powers of two, and encoded in + * INTERFACE_DESCRIPTOR_DATA with the following representations: + * + * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | + * ------------------------------------------------------------------- + * Gfx7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | + * ------------------------------------------------------------------- + * Gfx9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + */ + + if (bytes > 0) { + slm_size = intel_calculate_slm_size(gen, bytes); + assert(util_is_power_of_two_nonzero(slm_size)); + + if (gen >= 9) { + /* Turn an exponent of 10 (1024 kB) into 1. */ + assert(slm_size >= 1024); + slm_size = ffs(slm_size) - 10; + } else { + assert(slm_size >= 4096); + /* Convert to the pre-Gfx9 representation. */ + slm_size = slm_size / 4096; + } + } + + return slm_size; +} + +unsigned +brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, + unsigned threads); + +void +brw_write_shader_relocs(const struct brw_isa_info *isa, + void *program, + const struct brw_stage_prog_data *prog_data, + struct brw_shader_reloc_value *values, + unsigned num_values); + +/** + * Get the dispatch information for a shader to be used with GPGPU_WALKER and + * similar instructions. + * + * If override_local_size is not NULL, it must to point to a 3-element that + * will override the value from prog_data->local_size. This is used by + * ARB_compute_variable_group_size, where the size is set only at dispatch + * time (so prog_data is outdated). 
+ */ +struct intel_cs_dispatch_info +brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, + const struct brw_cs_prog_data *prog_data, + const unsigned *override_local_size); + +/** + * Return true if the given shader stage is dispatched contiguously by the + * relevant fixed function starting from channel 0 of the SIMD thread, which + * implies that the dispatch mask of a thread can be assumed to have the form + * '2^n - 1' for some n. + */ +static inline bool +brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo, + gl_shader_stage stage, unsigned max_polygons, + const struct brw_stage_prog_data *prog_data) +{ + /* The code below makes assumptions about the hardware's thread dispatch + * behavior that could be proven wrong in future generations -- Make sure + * to do a full test run with brw_fs_test_dispatch_packing() hooked up to + * the NIR front-end before changing this assertion. + */ + assert(devinfo->ver <= 12); + + switch (stage) { + case MESA_SHADER_FRAGMENT: { + /* The PSD discards subspans coming in with no lit samples, which in the + * per-pixel shading case implies that each subspan will either be fully + * lit (due to the VMask being used to allow derivative computations), + * or not dispatched at all. In per-sample dispatch mode individual + * samples from the same subspan have a fixed relative location within + * the SIMD thread, so dispatch of unlit samples cannot be avoided in + * general and we should return false. 
+ */ + const struct brw_wm_prog_data *wm_prog_data = + (const struct brw_wm_prog_data *)prog_data; + return devinfo->verx10 < 125 && + !wm_prog_data->persample_dispatch && + wm_prog_data->uses_vmask && + max_polygons < 2; + } + case MESA_SHADER_COMPUTE: + /* Compute shaders will be spawned with either a fully enabled dispatch + * mask or with whatever bottom/right execution mask was given to the + * GPGPU walker command to be used along the workgroup edges -- In both + * cases the dispatch mask is required to be tightly packed for our + * invocation index calculations to work. + */ + return true; + default: + /* Most remaining fixed functions are limited to use a packed dispatch + * mask due to the hardware representation of the dispatch mask as a + * single counter representing the number of enabled channels. + */ + return true; + } +} + +/** + * Computes the first varying slot in the URB produced by the previous stage + * that is used in the next stage. We do this by testing the varying slots in + * the previous stage's vue map against the inputs read in the next stage. + * + * Note that: + * + * - Each URB offset contains two varying slots and we can only skip a + * full offset if both slots are unused, so the value we return here is always + * rounded down to the closest multiple of two. + * + * - gl_Layer and gl_ViewportIndex don't have their own varying slots, they are + * part of the vue header, so if these are read we can't skip anything. 
+ */ +static inline int +brw_compute_first_urb_slot_required(uint64_t inputs_read, + const struct intel_vue_map *prev_stage_vue_map) +{ + if ((inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PRIMITIVE_SHADING_RATE)) == 0) { + for (int i = 0; i < prev_stage_vue_map->num_slots; i++) { + int varying = prev_stage_vue_map->slot_to_varying[i]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) + return ROUND_DOWN_TO(i, 2); + } + } + + return 0; +} + +/* From InlineData in 3DSTATE_TASK_SHADER_DATA and 3DSTATE_MESH_SHADER_DATA. */ +#define BRW_TASK_MESH_INLINE_DATA_SIZE_DW 8 + +/* InlineData[0-1] is used for Vulkan descriptor. */ +#define BRW_TASK_MESH_PUSH_CONSTANTS_START_DW 2 + +#define BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW \ + (BRW_TASK_MESH_INLINE_DATA_SIZE_DW - BRW_TASK_MESH_PUSH_CONSTANTS_START_DW) + +/** + * This enum is used as the base indice of the nir_load_topology_id_intel + * intrinsic. This is used to return different values based on some aspect of + * the topology of the device. + */ +enum brw_topology_id +{ + /* A value based of the DSS identifier the shader is currently running on. + * Be mindful that the DSS ID can be higher than the total number of DSS on + * the device. This is because of the fusing that can occur on different + * parts. + */ + BRW_TOPOLOGY_ID_DSS, + + /* A value composed of EU ID, thread ID & SIMD lane ID. 
*/ + BRW_TOPOLOGY_ID_EU_THREAD_SIMD, +}; + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* BRW_COMPILER_H */ diff --git a/src/intel/compiler/elk/brw_dead_control_flow.cpp b/src/intel/compiler/elk/brw_dead_control_flow.cpp new file mode 100644 index 00000000000..0d9253bab18 --- /dev/null +++ b/src/intel/compiler/elk/brw_dead_control_flow.cpp @@ -0,0 +1,121 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_dead_control_flow.cpp + * + * This file implements the dead control flow elimination optimization pass. 
+ */ + +#include "brw_shader.h" +#include "brw_cfg.h" + +using namespace brw; + +/* Look for and eliminate dead control flow: + * + * - if/endif + * - else in else/endif + * - then in if/else/endif + */ +bool +dead_control_flow_eliminate(backend_shader *s) +{ + bool progress = false; + + foreach_block_safe (block, s->cfg) { + bblock_t *prev_block = block->prev(); + + if (!prev_block) + continue; + + backend_instruction *const inst = block->start(); + backend_instruction *const prev_inst = prev_block->end(); + + /* ENDIF instructions, by definition, can only be found at the start of + * basic blocks. + */ + if (inst->opcode == BRW_OPCODE_ENDIF && + prev_inst->opcode == BRW_OPCODE_ELSE) { + bblock_t *const else_block = prev_block; + backend_instruction *const else_inst = prev_inst; + + else_inst->remove(else_block); + progress = true; + } else if (inst->opcode == BRW_OPCODE_ENDIF && + prev_inst->opcode == BRW_OPCODE_IF) { + bblock_t *const endif_block = block; + bblock_t *const if_block = prev_block; + backend_instruction *const endif_inst = inst; + backend_instruction *const if_inst = prev_inst; + + bblock_t *earlier_block = NULL, *later_block = NULL; + + if (if_block->start_ip == if_block->end_ip) { + earlier_block = if_block->prev(); + } else { + earlier_block = if_block; + } + if_inst->remove(if_block); + + if (endif_block->start_ip == endif_block->end_ip) { + later_block = endif_block->next(); + } else { + later_block = endif_block; + } + endif_inst->remove(endif_block); + + assert((earlier_block == NULL) == (later_block == NULL)); + if (earlier_block && earlier_block->can_combine_with(later_block)) { + earlier_block->combine_with(later_block); + + /* If ENDIF was in its own block, then we've now deleted it and + * merged the two surrounding blocks, the latter of which the + * __next block pointer was pointing to. 
+ */ + if (endif_block != later_block) { + __next = earlier_block->next(); + } + } + + progress = true; + } else if (inst->opcode == BRW_OPCODE_ELSE && + prev_inst->opcode == BRW_OPCODE_IF) { + bblock_t *const else_block = block; + backend_instruction *const if_inst = prev_inst; + backend_instruction *const else_inst = inst; + + /* Since the else-branch is becoming the new then-branch, the + * condition has to be inverted. + */ + if_inst->predicate_inverse = !if_inst->predicate_inverse; + else_inst->remove(else_block); + + progress = true; + } + } + + if (progress) + s->invalidate_analysis(DEPENDENCY_BLOCKS | DEPENDENCY_INSTRUCTIONS); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_dead_control_flow.h b/src/intel/compiler/elk/brw_dead_control_flow.h new file mode 100644 index 00000000000..9732c2b9f3f --- /dev/null +++ b/src/intel/compiler/elk/brw_dead_control_flow.h @@ -0,0 +1,31 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_DEAD_CONTROL_FLOW_H +#define BRW_DEAD_CONTROL_FLOW_H + +#include "brw_shader.h" + +bool dead_control_flow_eliminate(backend_shader *s); + +#endif /* BRW_DEAD_CONTROL_FLOW_H */ diff --git a/src/intel/compiler/elk/brw_debug_recompile.c b/src/intel/compiler/elk/brw_debug_recompile.c new file mode 100644 index 00000000000..6e055e09f7c --- /dev/null +++ b/src/intel/compiler/elk/brw_debug_recompile.c @@ -0,0 +1,238 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +/** + * @file brw_debug_recompiles.c + */ + +#include + +#include "brw_compiler.h" + +static bool +key_debug(const struct brw_compiler *c, void *log, + const char *name, int a, int b) +{ + if (a != b) { + brw_shader_perf_log(c, log, " %s %d->%d\n", name, a, b); + return true; + } + return false; +} + +static bool +key_debug_float(const struct brw_compiler *c, void *log, + const char *name, float a, float b) +{ + if (a != b) { + brw_shader_perf_log(c, log, " %s %f->%f\n", name, a, b); + return true; + } + return false; +} + +#define check(name, field) \ + key_debug(c, log, name, old_key->field, key->field) +#define check_float(name, field) \ + key_debug_float(c, log, name, old_key->field, key->field) + +static bool +debug_sampler_recompile(const struct brw_compiler *c, void *log, + const struct brw_sampler_prog_key_data *old_key, + const struct brw_sampler_prog_key_data *key) +{ + bool found = false; + + found |= check("gather channel quirk", gather_channel_quirk_mask); + + for (unsigned i = 0; i < BRW_MAX_SAMPLERS; i++) { + found |= check("EXT_texture_swizzle or DEPTH_TEXTURE_MODE", swizzles[i]); + found |= check("textureGather workarounds", gfx6_gather_wa[i]); + } + + for (unsigned i = 0; i < 3; i++) { + found |= check("GL_CLAMP enabled on any texture unit", gl_clamp_mask[i]); + } + + return found; +} + +static bool +debug_base_recompile(const struct brw_compiler *c, void *log, + const struct brw_base_prog_key *old_key, + const struct brw_base_prog_key *key) +{ + return debug_sampler_recompile(c, log, &old_key->tex, &key->tex); +} + +static void +debug_vs_recompile(const struct brw_compiler *c, void *log, + const struct brw_vs_prog_key *old_key, + const struct brw_vs_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + for (unsigned i = 0; i < VERT_ATTRIB_MAX; i++) { + found |= check("vertex attrib w/a flags", gl_attrib_wa_flags[i]); + } + + found |= check("legacy user clipping", nr_userclip_plane_consts); + found 
|= check("copy edgeflag", copy_edgeflag); + found |= check("pointcoord replace", point_coord_replace); + found |= check("vertex color clamping", clamp_vertex_color); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_tcs_recompile(const struct brw_compiler *c, void *log, + const struct brw_tcs_prog_key *old_key, + const struct brw_tcs_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + found |= check("input vertices", input_vertices); + found |= check("outputs written", outputs_written); + found |= check("patch outputs written", patch_outputs_written); + found |= check("tes primitive mode", _tes_primitive_mode); + found |= check("quads and equal_spacing workaround", quads_workaround); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_tes_recompile(const struct brw_compiler *c, void *log, + const struct brw_tes_prog_key *old_key, + const struct brw_tes_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + found |= check("inputs read", inputs_read); + found |= check("patch inputs read", patch_inputs_read); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_gs_recompile(const struct brw_compiler *c, void *log, + const struct brw_gs_prog_key *old_key, + const struct brw_gs_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_fs_recompile(const struct brw_compiler *c, void *log, + const struct brw_wm_prog_key *old_key, + const struct brw_wm_prog_key *key) +{ + bool found = false; + + found |= check("alphatest, computed depth, depth test, or depth write", + iz_lookup); + found |= check("depth statistics", stats_wm); + found |= check("flat shading", flat_shade); + found |= check("number of color buffers", 
nr_color_regions); + found |= check("MRT alpha test", alpha_test_replicate_alpha); + found |= check("alpha to coverage", alpha_to_coverage); + found |= check("fragment color clamping", clamp_fragment_color); + found |= check("per-sample interpolation", persample_interp); + found |= check("multisampled FBO", multisample_fbo); + found |= check("line smoothing", line_aa); + found |= check("force dual color blending", force_dual_color_blend); + found |= check("coherent fb fetch", coherent_fb_fetch); + found |= check("ignore sample mask out", ignore_sample_mask_out); + found |= check("coarse pixel", coarse_pixel); + + found |= check("input slots valid", input_slots_valid); + found |= check("mrt alpha test function", alpha_test_func); + found |= check("mrt alpha test reference value", alpha_test_ref); + + found |= debug_base_recompile(c, log, &old_key->base, &key->base); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +static void +debug_cs_recompile(const struct brw_compiler *c, void *log, + const struct brw_cs_prog_key *old_key, + const struct brw_cs_prog_key *key) +{ + bool found = debug_base_recompile(c, log, &old_key->base, &key->base); + + if (!found) { + brw_shader_perf_log(c, log, " something else\n"); + } +} + +void +brw_debug_key_recompile(const struct brw_compiler *c, void *log, + gl_shader_stage stage, + const struct brw_base_prog_key *old_key, + const struct brw_base_prog_key *key) +{ + if (!old_key) { + brw_shader_perf_log(c, log, " No previous compile found...\n"); + return; + } + + switch (stage) { + case MESA_SHADER_VERTEX: + debug_vs_recompile(c, log, (const struct brw_vs_prog_key *)old_key, + (const struct brw_vs_prog_key *)key); + break; + case MESA_SHADER_TESS_CTRL: + debug_tcs_recompile(c, log, (const struct brw_tcs_prog_key *)old_key, + (const struct brw_tcs_prog_key *)key); + break; + case MESA_SHADER_TESS_EVAL: + debug_tes_recompile(c, log, (const struct brw_tes_prog_key *)old_key, + (const struct brw_tes_prog_key 
*)key); + break; + case MESA_SHADER_GEOMETRY: + debug_gs_recompile(c, log, (const struct brw_gs_prog_key *)old_key, + (const struct brw_gs_prog_key *)key); + break; + case MESA_SHADER_FRAGMENT: + debug_fs_recompile(c, log, (const struct brw_wm_prog_key *)old_key, + (const struct brw_wm_prog_key *)key); + break; + case MESA_SHADER_COMPUTE: + debug_cs_recompile(c, log, (const struct brw_cs_prog_key *)old_key, + (const struct brw_cs_prog_key *)key); + break; + default: + break; + } +} diff --git a/src/intel/compiler/elk/brw_device_sha1_gen_c.py b/src/intel/compiler/elk/brw_device_sha1_gen_c.py new file mode 100755 index 00000000000..06aaa3b5478 --- /dev/null +++ b/src/intel/compiler/elk/brw_device_sha1_gen_c.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +COPYRIGHT = """\ +/* + * Copyright 2024 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +""" + +import argparse +import os +import sys + +from mako.template import Template +from mako import exceptions + +sys.path.append(f"{os.path.dirname(sys.argv[0])}/../dev") +import intel_device_info + +template = COPYRIGHT + """ + +/* DO NOT EDIT - This file generated automatically by intel_device_serialize_c.py script */ + +#include "dev/intel_device_info.h" +#include "brw_compiler.h" +#define SHA_UPDATE_FIELD(field) _mesa_sha1_update(ctx, &devinfo->field, sizeof(devinfo->field)) + +void +brw_device_sha1_update(struct mesa_sha1 *ctx, + const struct intel_device_info *devinfo) { +% for member in compiler_fields: + SHA_UPDATE_FIELD(${member.name}); +% endfor +} + +#undef SHA_UPDATE_FIELD + +""" + +def main(): + """print intel_device_serialize.c at the specified path""" + parser = argparse.ArgumentParser() + parser.add_argument('--outdir', required=True, + help='Directory to put the generated files in') + args = parser.parse_args() + path = os.path.join(args.outdir, 'brw_device_sha1_gen.c') + device_members = intel_device_info.TYPES_BY_NAME["intel_device_info"].members + compiler_fields = [field for field in device_members if field.compiler_field] + with open(path, 'w', encoding='utf-8') as f: + try: + f.write(Template(template).render(compiler_fields=compiler_fields)) + except: + print(exceptions.text_error_template().render(compiler_fields=compiler_fields)) + +if __name__ == "__main__": + main() diff --git a/src/intel/compiler/elk/brw_disasm.c b/src/intel/compiler/elk/brw_disasm.c new file mode 100644 index 00000000000..b70ee663a9a --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm.c @@ -0,0 +1,2887 @@ +/* + * Copyright © 2008 Keith Packard + * Copyright © 2014 Intel Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice 
appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. The copyright holders make no representations + * about the suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE. + */ + +#include +#include +#include +#include + +#include "brw_disasm.h" +#include "brw_disasm_info.h" +#include "brw_eu_defines.h" +#include "brw_eu.h" +#include "brw_inst.h" +#include "brw_isa_info.h" +#include "brw_reg.h" +#include "brw_shader.h" +#include "util/half_float.h" + +bool +brw_has_jip(const struct intel_device_info *devinfo, enum opcode opcode) +{ + if (devinfo->ver < 6) + return false; + + return opcode == BRW_OPCODE_IF || + opcode == BRW_OPCODE_ELSE || + opcode == BRW_OPCODE_ENDIF || + opcode == BRW_OPCODE_WHILE || + opcode == BRW_OPCODE_BREAK || + opcode == BRW_OPCODE_CONTINUE || + opcode == BRW_OPCODE_HALT; +} + +bool +brw_has_uip(const struct intel_device_info *devinfo, enum opcode opcode) +{ + if (devinfo->ver < 6) + return false; + + return (devinfo->ver >= 7 && opcode == BRW_OPCODE_IF) || + (devinfo->ver >= 8 && opcode == BRW_OPCODE_ELSE) || + opcode == BRW_OPCODE_BREAK || + opcode == BRW_OPCODE_CONTINUE || + opcode == BRW_OPCODE_HALT; +} + +static bool +has_branch_ctrl(const struct intel_device_info *devinfo, enum opcode opcode) +{ + if (devinfo->ver < 8) + return false; + + 
return opcode == BRW_OPCODE_IF ||
+          opcode == BRW_OPCODE_ELSE;
+          /* opcode == BRW_OPCODE_GOTO; */
+}
+
+static bool
+is_logic_instruction(unsigned opcode)
+{
+   return opcode == BRW_OPCODE_AND ||
+          opcode == BRW_OPCODE_NOT ||
+          opcode == BRW_OPCODE_OR ||
+          opcode == BRW_OPCODE_XOR;
+}
+
+static bool
+is_send(unsigned opcode)
+{
+   return opcode == BRW_OPCODE_SEND ||
+          opcode == BRW_OPCODE_SENDC ||
+          opcode == BRW_OPCODE_SENDS ||
+          opcode == BRW_OPCODE_SENDSC;
+}
+
+static bool
+is_split_send(const struct intel_device_info *devinfo, unsigned opcode) /* devinfo is read below; stale UNUSED marker dropped */
+{
+   if (devinfo->ver >= 12)
+      return is_send(opcode); /* Gfx12+: all sends use the split-send encoding */
+   else
+      return opcode == BRW_OPCODE_SENDS ||
+             opcode == BRW_OPCODE_SENDSC;
+}
+
+const char *const conditional_modifier[16] = {
+   [BRW_CONDITIONAL_NONE] = "",
+   [BRW_CONDITIONAL_Z]    = ".z",
+   [BRW_CONDITIONAL_NZ]   = ".nz",
+   [BRW_CONDITIONAL_G]    = ".g",
+   [BRW_CONDITIONAL_GE]   = ".ge",
+   [BRW_CONDITIONAL_L]    = ".l",
+   [BRW_CONDITIONAL_LE]   = ".le",
+   [BRW_CONDITIONAL_R]    = ".r",
+   [BRW_CONDITIONAL_O]    = ".o",
+   [BRW_CONDITIONAL_U]    = ".u",
+};
+
+static const char *const m_negate[2] = {
+   [0] = "",
+   [1] = "-",
+};
+
+static const char *const _abs[2] = {
+   [0] = "",
+   [1] = "(abs)",
+};
+
+static const char *const m_bitnot[2] = { "", "~" };
+
+static const char *const vert_stride[16] = {
+   [0] = "0",
+   [1] = "1",
+   [2] = "2",
+   [3] = "4",
+   [4] = "8",
+   [5] = "16",
+   [6] = "32",
+   [15] = "VxH",
+};
+
+static const char *const width[8] = {
+   [0] = "1",
+   [1] = "2",
+   [2] = "4",
+   [3] = "8",
+   [4] = "16",
+};
+
+static const char *const horiz_stride[4] = {
+   [0] = "0",
+   [1] = "1",
+   [2] = "2",
+   [3] = "4"
+};
+
+static const char *const chan_sel[4] = {
+   [0] = "x",
+   [1] = "y",
+   [2] = "z",
+   [3] = "w",
+};
+
+static const char *const debug_ctrl[2] = {
+   [0] = "",
+   [1] = ".breakpoint"
+};
+
+static const char *const saturate[2] = {
+   [0] = "",
+   [1] = ".sat"
+};
+
+static const char *const cmpt_ctrl[2] = {
+   [0] = "",
+   [1] = "compacted"
+};
+
+static const char *const
accwr[2] = { + [0] = "", + [1] = "AccWrEnable" +}; + +static const char *const branch_ctrl[2] = { + [0] = "", + [1] = "BranchCtrl" +}; + +static const char *const wectrl[2] = { + [0] = "", + [1] = "WE_all" +}; + +static const char *const exec_size[8] = { + [0] = "1", + [1] = "2", + [2] = "4", + [3] = "8", + [4] = "16", + [5] = "32" +}; + +static const char *const pred_inv[2] = { + [0] = "+", + [1] = "-" +}; + +const char *const pred_ctrl_align16[16] = { + [1] = "", + [2] = ".x", + [3] = ".y", + [4] = ".z", + [5] = ".w", + [6] = ".any4h", + [7] = ".all4h", +}; + +static const char *const pred_ctrl_align1[16] = { + [BRW_PREDICATE_NORMAL] = "", + [BRW_PREDICATE_ALIGN1_ANYV] = ".anyv", + [BRW_PREDICATE_ALIGN1_ALLV] = ".allv", + [BRW_PREDICATE_ALIGN1_ANY2H] = ".any2h", + [BRW_PREDICATE_ALIGN1_ALL2H] = ".all2h", + [BRW_PREDICATE_ALIGN1_ANY4H] = ".any4h", + [BRW_PREDICATE_ALIGN1_ALL4H] = ".all4h", + [BRW_PREDICATE_ALIGN1_ANY8H] = ".any8h", + [BRW_PREDICATE_ALIGN1_ALL8H] = ".all8h", + [BRW_PREDICATE_ALIGN1_ANY16H] = ".any16h", + [BRW_PREDICATE_ALIGN1_ALL16H] = ".all16h", + [BRW_PREDICATE_ALIGN1_ANY32H] = ".any32h", + [BRW_PREDICATE_ALIGN1_ALL32H] = ".all32h", +}; + +static const char *const xe2_pred_ctrl[4] = { + [BRW_PREDICATE_NORMAL] = "", + [XE2_PREDICATE_ANY] = ".any", + [XE2_PREDICATE_ALL] = ".all", +}; + +static const char *const thread_ctrl[4] = { + [BRW_THREAD_NORMAL] = "", + [BRW_THREAD_ATOMIC] = "atomic", + [BRW_THREAD_SWITCH] = "switch", +}; + +static const char *const compr_ctrl[4] = { + [0] = "", + [1] = "sechalf", + [2] = "compr", + [3] = "compr4", +}; + +static const char *const dep_ctrl[4] = { + [0] = "", + [1] = "NoDDClr", + [2] = "NoDDChk", + [3] = "NoDDClr,NoDDChk", +}; + +static const char *const mask_ctrl[4] = { + [0] = "", + [1] = "nomask", +}; + +static const char *const access_mode[2] = { + [0] = "align1", + [1] = "align16", +}; + +static const char *const reg_file[4] = { + [0] = "A", + [1] = "g", + [2] = "m", + [3] = "imm", +}; + +static const char 
*const writemask[16] = { + [0x0] = ".", + [0x1] = ".x", + [0x2] = ".y", + [0x3] = ".xy", + [0x4] = ".z", + [0x5] = ".xz", + [0x6] = ".yz", + [0x7] = ".xyz", + [0x8] = ".w", + [0x9] = ".xw", + [0xa] = ".yw", + [0xb] = ".xyw", + [0xc] = ".zw", + [0xd] = ".xzw", + [0xe] = ".yzw", + [0xf] = "", +}; + +static const char *const end_of_thread[2] = { + [0] = "", + [1] = "EOT" +}; + +/* SFIDs on Gfx4-5 */ +static const char *const gfx4_sfid[16] = { + [BRW_SFID_NULL] = "null", + [BRW_SFID_MATH] = "math", + [BRW_SFID_SAMPLER] = "sampler", + [BRW_SFID_MESSAGE_GATEWAY] = "gateway", + [BRW_SFID_DATAPORT_READ] = "read", + [BRW_SFID_DATAPORT_WRITE] = "write", + [BRW_SFID_URB] = "urb", + [BRW_SFID_THREAD_SPAWNER] = "thread_spawner", + [BRW_SFID_VME] = "vme", +}; + +static const char *const gfx6_sfid[16] = { + [BRW_SFID_NULL] = "null", + [BRW_SFID_MATH] = "math", + [BRW_SFID_SAMPLER] = "sampler", + [BRW_SFID_MESSAGE_GATEWAY] = "gateway", + [BRW_SFID_URB] = "urb", + [BRW_SFID_THREAD_SPAWNER] = "thread_spawner", + [GFX6_SFID_DATAPORT_SAMPLER_CACHE] = "dp_sampler", + [GFX6_SFID_DATAPORT_RENDER_CACHE] = "render", + [GFX6_SFID_DATAPORT_CONSTANT_CACHE] = "const", + [GFX7_SFID_DATAPORT_DATA_CACHE] = "data", + [GFX7_SFID_PIXEL_INTERPOLATOR] = "pixel interp", + [HSW_SFID_DATAPORT_DATA_CACHE_1] = "dp data 1", + [HSW_SFID_CRE] = "cre", + [GEN_RT_SFID_RAY_TRACE_ACCELERATOR] = "rt accel", + [GFX12_SFID_SLM] = "slm", + [GFX12_SFID_TGM] = "tgm", + [GFX12_SFID_UGM] = "ugm", +}; + +static const char *const gfx7_gateway_subfuncid[8] = { + [BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY] = "open", + [BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY] = "close", + [BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG] = "forward msg", + [BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP] = "get timestamp", + [BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG] = "barrier msg", + [BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE] = "update state", + [BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE] = "mmio read/write", +}; + +static const char *const 
gfx4_dp_read_port_msg_type[4] = { + [0b00] = "OWord Block Read", + [0b01] = "OWord Dual Block Read", + [0b10] = "Media Block Read", + [0b11] = "DWord Scattered Read", +}; + +static const char *const g45_dp_read_port_msg_type[8] = { + [0b000] = "OWord Block Read", + [0b010] = "OWord Dual Block Read", + [0b100] = "Media Block Read", + [0b110] = "DWord Scattered Read", + [0b001] = "Render Target UNORM Read", + [0b011] = "AVC Loop Filter Read", +}; + +static const char *const dp_write_port_msg_type[8] = { + [0b000] = "OWord block write", + [0b001] = "OWord dual block write", + [0b010] = "media block write", + [0b011] = "DWord scattered write", + [0b100] = "RT write", + [0b101] = "streamed VB write", + [0b110] = "RT UNORM write", /* G45+ */ + [0b111] = "flush render cache", +}; + +static const char *const dp_rc_msg_type_gfx6[16] = { + [BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ] = "OWORD block read", + [GFX6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ] = "RT UNORM read", + [GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ] = "OWORD dual block read", + [GFX6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ] = "media block read", + [GFX6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ] = + "OWORD unaligned block read", + [GFX6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ] = "DWORD scattered read", + [GFX6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE] = "DWORD atomic write", + [GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE] = "OWORD block write", + [GFX6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE] = + "OWORD dual block write", + [GFX6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE] = "media block write", + [GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE] = + "DWORD scattered write", + [GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE] = "RT write", + [GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE] = "streamed VB write", + [GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE] = "RT UNORM write", +}; + +static const char *const dp_rc_msg_type_gfx7[16] = { + [GFX7_DATAPORT_RC_MEDIA_BLOCK_READ] 
= "media block read", + [GFX7_DATAPORT_RC_TYPED_SURFACE_READ] = "typed surface read", + [GFX7_DATAPORT_RC_TYPED_ATOMIC_OP] = "typed atomic op", + [GFX7_DATAPORT_RC_MEMORY_FENCE] = "memory fence", + [GFX7_DATAPORT_RC_MEDIA_BLOCK_WRITE] = "media block write", + [GFX7_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write", + [GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE] = "typed surface write" +}; + +static const char *const dp_rc_msg_type_gfx9[16] = { + [GFX9_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write", + [GFX9_DATAPORT_RC_RENDER_TARGET_READ] = "RT read" +}; + +static const char *const * +dp_rc_msg_type(const struct intel_device_info *devinfo) +{ + return (devinfo->ver >= 9 ? dp_rc_msg_type_gfx9 : + devinfo->ver >= 7 ? dp_rc_msg_type_gfx7 : + devinfo->ver >= 6 ? dp_rc_msg_type_gfx6 : + dp_write_port_msg_type); +} + +static const char *const m_rt_write_subtype[] = { + [0b000] = "SIMD16", + [0b001] = "SIMD16/RepData", + [0b010] = "SIMD8/DualSrcLow", + [0b011] = "SIMD8/DualSrcHigh", + [0b100] = "SIMD8", + [0b101] = "SIMD8/ImageWrite", /* Gfx6+ */ + [0b111] = "SIMD16/RepData-111", /* no idea how this is different than 1 */ +}; + +static const char *const dp_dc0_msg_type_gfx7[16] = { + [GFX7_DATAPORT_DC_OWORD_BLOCK_READ] = "DC OWORD block read", + [GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ] = + "DC unaligned OWORD block read", + [GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ] = "DC OWORD dual block read", + [GFX7_DATAPORT_DC_DWORD_SCATTERED_READ] = "DC DWORD scattered read", + [GFX7_DATAPORT_DC_BYTE_SCATTERED_READ] = "DC byte scattered read", + [GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ] = "DC untyped surface read", + [GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP] = "DC untyped atomic", + [GFX7_DATAPORT_DC_MEMORY_FENCE] = "DC mfence", + [GFX7_DATAPORT_DC_OWORD_BLOCK_WRITE] = "DC OWORD block write", + [GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE] = "DC OWORD dual block write", + [GFX7_DATAPORT_DC_DWORD_SCATTERED_WRITE] = "DC DWORD scatterd write", + [GFX7_DATAPORT_DC_BYTE_SCATTERED_WRITE] = "DC byte scattered 
write", + [GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE] = "DC untyped surface write", +}; + +static const char *const dp_oword_block_rw[8] = { + [BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW] = "1-low", + [BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH] = "1-high", + [BRW_DATAPORT_OWORD_BLOCK_2_OWORDS] = "2", + [BRW_DATAPORT_OWORD_BLOCK_4_OWORDS] = "4", + [BRW_DATAPORT_OWORD_BLOCK_8_OWORDS] = "8", +}; + +static const char *const dp_dc1_msg_type_hsw[32] = { + [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ] = "untyped surface read", + [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP] = "DC untyped atomic op", + [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2] = + "DC untyped 4x2 atomic op", + [HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ] = "DC media block read", + [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ] = "DC typed surface read", + [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP] = "DC typed atomic", + [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2] = "DC typed 4x2 atomic op", + [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE] = "DC untyped surface write", + [HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE] = "DC media block write", + [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP] = "DC atomic counter op", + [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2] = + "DC 4x2 atomic counter op", + [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE] = "DC typed surface write", + [GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ] = "DC A64 scattered read", + [GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ] = "DC A64 untyped surface read", + [GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP] = "DC A64 untyped atomic op", + [GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ] = "DC A64 oword block read", + [GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE] = "DC A64 oword block write", + [GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE] = "DC A64 untyped surface write", + [GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE] = "DC A64 scattered write", + [GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP] = + "DC untyped atomic float op", + 
[GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP] = + "DC A64 untyped atomic float op", + [GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP] = + "DC A64 untyped atomic half-integer op", + [GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP] = + "DC A64 untyped atomic half-float op", +}; + +static const char *const aop[16] = { + [BRW_AOP_AND] = "and", + [BRW_AOP_OR] = "or", + [BRW_AOP_XOR] = "xor", + [BRW_AOP_MOV] = "mov", + [BRW_AOP_INC] = "inc", + [BRW_AOP_DEC] = "dec", + [BRW_AOP_ADD] = "add", + [BRW_AOP_SUB] = "sub", + [BRW_AOP_REVSUB] = "revsub", + [BRW_AOP_IMAX] = "imax", + [BRW_AOP_IMIN] = "imin", + [BRW_AOP_UMAX] = "umax", + [BRW_AOP_UMIN] = "umin", + [BRW_AOP_CMPWR] = "cmpwr", + [BRW_AOP_PREDEC] = "predec", +}; + +static const char *const aop_float[5] = { + [BRW_AOP_FMAX] = "fmax", + [BRW_AOP_FMIN] = "fmin", + [BRW_AOP_FCMPWR] = "fcmpwr", + [BRW_AOP_FADD] = "fadd", +}; + +static const char * const pixel_interpolator_msg_types[4] = { + [GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET] = "per_message_offset", + [GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE] = "sample_position", + [GFX7_PIXEL_INTERPOLATOR_LOC_CENTROID] = "centroid", + [GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET] = "per_slot_offset", +}; + +static const char *const math_function[16] = { + [BRW_MATH_FUNCTION_INV] = "inv", + [BRW_MATH_FUNCTION_LOG] = "log", + [BRW_MATH_FUNCTION_EXP] = "exp", + [BRW_MATH_FUNCTION_SQRT] = "sqrt", + [BRW_MATH_FUNCTION_RSQ] = "rsq", + [BRW_MATH_FUNCTION_SIN] = "sin", + [BRW_MATH_FUNCTION_COS] = "cos", + [BRW_MATH_FUNCTION_SINCOS] = "sincos", + [BRW_MATH_FUNCTION_FDIV] = "fdiv", + [BRW_MATH_FUNCTION_POW] = "pow", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv", + [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod", + [GFX8_MATH_FUNCTION_INVM] = "invm", + [GFX8_MATH_FUNCTION_RSQRTM] = "rsqrtm", +}; + +static const char *const sync_function[16] = { + [TGL_SYNC_NOP] = "nop", + [TGL_SYNC_ALLRD] = "allrd", 
+ [TGL_SYNC_ALLWR] = "allwr", + [TGL_SYNC_FENCE] = "fence", + [TGL_SYNC_BAR] = "bar", + [TGL_SYNC_HOST] = "host", +}; + +static const char *const math_saturate[2] = { + [0] = "", + [1] = "sat" +}; + +static const char *const math_signed[2] = { + [0] = "", + [1] = "signed" +}; + +static const char *const math_scalar[2] = { + [0] = "", + [1] = "scalar" +}; + +static const char *const math_precision[2] = { + [0] = "", + [1] = "partial_precision" +}; + +static const char *const gfx5_urb_opcode[] = { + [0] = "urb_write", + [1] = "ff_sync", +}; + +static const char *const gfx7_urb_opcode[] = { + [BRW_URB_OPCODE_WRITE_HWORD] = "write HWord", + [BRW_URB_OPCODE_WRITE_OWORD] = "write OWord", + [BRW_URB_OPCODE_READ_HWORD] = "read HWord", + [BRW_URB_OPCODE_READ_OWORD] = "read OWord", + [GFX7_URB_OPCODE_ATOMIC_MOV] = "atomic mov", /* Gfx7+ */ + [GFX7_URB_OPCODE_ATOMIC_INC] = "atomic inc", /* Gfx7+ */ + [GFX8_URB_OPCODE_ATOMIC_ADD] = "atomic add", /* Gfx8+ */ + [GFX8_URB_OPCODE_SIMD8_WRITE] = "SIMD8 write", /* Gfx8+ */ + [GFX8_URB_OPCODE_SIMD8_READ] = "SIMD8 read", /* Gfx8+ */ + [GFX125_URB_OPCODE_FENCE] = "fence", /* Gfx12.5+ */ + /* [10-15] - reserved */ +}; + +static const char *const urb_swizzle[4] = { + [BRW_URB_SWIZZLE_NONE] = "", + [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave", + [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose", +}; + +static const char *const urb_allocate[2] = { + [0] = "", + [1] = "allocate" +}; + +static const char *const urb_used[2] = { + [0] = "", + [1] = "used" +}; + +static const char *const urb_complete[2] = { + [0] = "", + [1] = "complete" +}; + +static const char *const gfx5_sampler_msg_type[] = { + [GFX5_SAMPLER_MESSAGE_SAMPLE] = "sample", + [GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS] = "sample_b", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LOD] = "sample_l", + [GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE] = "sample_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS] = "sample_d", + [GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE] = "sample_b_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE] = 
"sample_l_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LD] = "ld", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4] = "gather4", + [GFX5_SAMPLER_MESSAGE_LOD] = "lod", + [GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO] = "resinfo", + [GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO] = "sampleinfo", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C] = "gather4_c", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO] = "gather4_po", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C] = "gather4_po_c", + [HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LZ] = "sample_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ] = "sample_c_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ] = "ld_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W] = "ld2dms_w", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS] = "ld_mcs", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS] = "ld2dms", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS] = "ld2dss", +}; + +static const char *const xe2_sampler_msg_type[] = { + [GFX5_SAMPLER_MESSAGE_SAMPLE] = "sample", + [GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS] = "sample_b", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LOD] = "sample_l", + [GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE] = "sample_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS] = "sample_d", + [GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE] = "sample_b_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE] = "sample_l_c", + [GFX5_SAMPLER_MESSAGE_SAMPLE_LD] = "ld", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4] = "gather4", + [GFX5_SAMPLER_MESSAGE_LOD] = "lod", + [GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO] = "resinfo", + [GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO] = "sampleinfo", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C] = "gather4_c", + [GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO] = "gather4_po", + [XE2_SAMPLER_MESSAGE_SAMPLE_MLOD] = "sample_mlod", + [XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD] = "sample_c_mlod", + [HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LZ] = "sample_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ] = "sample_c_lz", + [GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ] = "ld_lz", + 
[GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W] = "ld2dms_w", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS] = "ld_mcs", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS] = "ld2dms", + [GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS] = "ld2dss", +}; + +static const char *const gfx5_sampler_simd_mode[7] = { + [BRW_SAMPLER_SIMD_MODE_SIMD4X2] = "SIMD4x2", + [BRW_SAMPLER_SIMD_MODE_SIMD8] = "SIMD8", + [BRW_SAMPLER_SIMD_MODE_SIMD16] = "SIMD16", + [BRW_SAMPLER_SIMD_MODE_SIMD32_64] = "SIMD32/64", + [GFX10_SAMPLER_SIMD_MODE_SIMD8H] = "SIMD8H", + [GFX10_SAMPLER_SIMD_MODE_SIMD16H] = "SIMD16H", +}; + +static const char *const xe2_sampler_simd_mode[7] = { + [XE2_SAMPLER_SIMD_MODE_SIMD16] = "SIMD16", + [XE2_SAMPLER_SIMD_MODE_SIMD32] = "SIMD32", + [XE2_SAMPLER_SIMD_MODE_SIMD16H] = "SIMD16H", + [XE2_SAMPLER_SIMD_MODE_SIMD32H] = "SIMD32H", +}; + +static const char *const sampler_target_format[4] = { + [0] = "F", + [2] = "UD", + [3] = "D" +}; + +static const char *const lsc_operation[] = { + [LSC_OP_LOAD] = "load", + [LSC_OP_LOAD_CMASK] = "load_cmask", + [LSC_OP_STORE] = "store", + [LSC_OP_STORE_CMASK] = "store_cmask", + [LSC_OP_FENCE] = "fence", + [LSC_OP_ATOMIC_INC] = "atomic_inc", + [LSC_OP_ATOMIC_DEC] = "atomic_dec", + [LSC_OP_ATOMIC_LOAD] = "atomic_load", + [LSC_OP_ATOMIC_STORE] = "atomic_store", + [LSC_OP_ATOMIC_ADD] = "atomic_add", + [LSC_OP_ATOMIC_SUB] = "atomic_sub", + [LSC_OP_ATOMIC_MIN] = "atomic_min", + [LSC_OP_ATOMIC_MAX] = "atomic_max", + [LSC_OP_ATOMIC_UMIN] = "atomic_umin", + [LSC_OP_ATOMIC_UMAX] = "atomic_umax", + [LSC_OP_ATOMIC_CMPXCHG] = "atomic_cmpxchg", + [LSC_OP_ATOMIC_FADD] = "atomic_fadd", + [LSC_OP_ATOMIC_FSUB] = "atomic_fsub", + [LSC_OP_ATOMIC_FMIN] = "atomic_fmin", + [LSC_OP_ATOMIC_FMAX] = "atomic_fmax", + [LSC_OP_ATOMIC_FCMPXCHG] = "atomic_fcmpxchg", + [LSC_OP_ATOMIC_AND] = "atomic_and", + [LSC_OP_ATOMIC_OR] = "atomic_or", + [LSC_OP_ATOMIC_XOR] = "atomic_xor", +}; + +static const char *const lsc_addr_surface_type[] = { + [LSC_ADDR_SURFTYPE_FLAT] = "flat", + [LSC_ADDR_SURFTYPE_BSS] = "bss", + 
[LSC_ADDR_SURFTYPE_SS] = "ss", + [LSC_ADDR_SURFTYPE_BTI] = "bti", +}; + +static const char* const lsc_fence_scope[] = { + [LSC_FENCE_THREADGROUP] = "threadgroup", + [LSC_FENCE_LOCAL] = "local", + [LSC_FENCE_TILE] = "tile", + [LSC_FENCE_GPU] = "gpu", + [LSC_FENCE_ALL_GPU] = "all_gpu", + [LSC_FENCE_SYSTEM_RELEASE] = "system_release", + [LSC_FENCE_SYSTEM_ACQUIRE] = "system_acquire", +}; + +static const char* const lsc_flush_type[] = { + [LSC_FLUSH_TYPE_NONE] = "none", + [LSC_FLUSH_TYPE_EVICT] = "evict", + [LSC_FLUSH_TYPE_INVALIDATE] = "invalidate", + [LSC_FLUSH_TYPE_DISCARD] = "discard", + [LSC_FLUSH_TYPE_CLEAN] = "clean", + [LSC_FLUSH_TYPE_L3ONLY] = "l3only", + [LSC_FLUSH_TYPE_NONE_6] = "none_6", +}; + +static const char* const lsc_addr_size[] = { + [LSC_ADDR_SIZE_A16] = "a16", + [LSC_ADDR_SIZE_A32] = "a32", + [LSC_ADDR_SIZE_A64] = "a64", +}; + +static const char* const lsc_backup_fence_routing[] = { + [LSC_NORMAL_ROUTING] = "normal_routing", + [LSC_ROUTE_TO_LSC] = "route_to_lsc", +}; + +static const char* const lsc_data_size[] = { + [LSC_DATA_SIZE_D8] = "d8", + [LSC_DATA_SIZE_D16] = "d16", + [LSC_DATA_SIZE_D32] = "d32", + [LSC_DATA_SIZE_D64] = "d64", + [LSC_DATA_SIZE_D8U32] = "d8u32", + [LSC_DATA_SIZE_D16U32] = "d16u32", + [LSC_DATA_SIZE_D16BF32] = "d16bf32", +}; + +static const char* const lsc_vect_size_str[] = { + [LSC_VECT_SIZE_V1] = "V1", + [LSC_VECT_SIZE_V2] = "V2", + [LSC_VECT_SIZE_V3] = "V3", + [LSC_VECT_SIZE_V4] = "V4", + [LSC_VECT_SIZE_V8] = "V8", + [LSC_VECT_SIZE_V16] = "V16", + [LSC_VECT_SIZE_V32] = "V32", + [LSC_VECT_SIZE_V64] = "V64", +}; + +static const char* const lsc_cmask_str[] = { + [LSC_CMASK_X] = "x", + [LSC_CMASK_Y] = "y", + [LSC_CMASK_XY] = "xy", + [LSC_CMASK_Z] = "z", + [LSC_CMASK_XZ] = "xz", + [LSC_CMASK_YZ] = "yz", + [LSC_CMASK_XYZ] = "xyz", + [LSC_CMASK_W] = "w", + [LSC_CMASK_XW] = "xw", + [LSC_CMASK_YW] = "yw", + [LSC_CMASK_XYW] = "xyw", + [LSC_CMASK_ZW] = "zw", + [LSC_CMASK_XZW] = "xzw", + [LSC_CMASK_YZW] = "yzw", + [LSC_CMASK_XYZW] = 
"xyzw", +}; + +static const char* const lsc_cache_load[] = { + [LSC_CACHE_LOAD_L1STATE_L3MOCS] = "L1STATE_L3MOCS", + [LSC_CACHE_LOAD_L1UC_L3UC] = "L1UC_L3UC", + [LSC_CACHE_LOAD_L1UC_L3C] = "L1UC_L3C", + [LSC_CACHE_LOAD_L1C_L3UC] = "L1C_L3UC", + [LSC_CACHE_LOAD_L1C_L3C] = "L1C_L3C", + [LSC_CACHE_LOAD_L1S_L3UC] = "L1S_L3UC", + [LSC_CACHE_LOAD_L1S_L3C] = "L1S_L3C", + [LSC_CACHE_LOAD_L1IAR_L3C] = "L1IAR_L3C", +}; + +static const char* const lsc_cache_store[] = { + [LSC_CACHE_STORE_L1STATE_L3MOCS] = "L1STATE_L3MOCS", + [LSC_CACHE_STORE_L1UC_L3UC] = "L1UC_L3UC", + [LSC_CACHE_STORE_L1UC_L3WB] = "L1UC_L3WB", + [LSC_CACHE_STORE_L1WT_L3UC] = "L1WT_L3UC", + [LSC_CACHE_STORE_L1WT_L3WB] = "L1WT_L3WB", + [LSC_CACHE_STORE_L1S_L3UC] = "L1S_L3UC", + [LSC_CACHE_STORE_L1S_L3WB] = "L1S_L3WB", + [LSC_CACHE_STORE_L1WB_L3WB] = "L1WB_L3WB", +}; + +static const char* const xe2_lsc_cache_load[] = { + [XE2_LSC_CACHE_LOAD_L1STATE_L3MOCS] = "L1STATE_L3MOCS", + [XE2_LSC_CACHE_LOAD_L1UC_L3UC] = "L1UC_L3UC", + [XE2_LSC_CACHE_LOAD_L1UC_L3C] = "L1UC_L3C", + [XE2_LSC_CACHE_LOAD_L1UC_L3CC] = "L1UC_L3CC", + [XE2_LSC_CACHE_LOAD_L1C_L3UC] = "L1C_L3UC", + [XE2_LSC_CACHE_LOAD_L1C_L3C] = "L1C_L3C", + [XE2_LSC_CACHE_LOAD_L1C_L3CC] = "L1C_L3CC", + [XE2_LSC_CACHE_LOAD_L1S_L3UC] = "L1S_L3UC", + [XE2_LSC_CACHE_LOAD_L1S_L3C] = "L1S_L3C", + [XE2_LSC_CACHE_LOAD_L1IAR_L3IAR] = "L1IAR_L3IAR", +}; + +static const char* const xe2_lsc_cache_store[] = { + [XE2_LSC_CACHE_STORE_L1STATE_L3MOCS] = "L1STATE_L3MOCS", + [XE2_LSC_CACHE_STORE_L1UC_L3UC] = "L1UC_L3UC", + [XE2_LSC_CACHE_STORE_L1UC_L3WB] = "L1UC_L3WB", + [XE2_LSC_CACHE_STORE_L1WT_L3UC] = "L1WT_L3UC", + [XE2_LSC_CACHE_STORE_L1WT_L3WB] = "L1WT_L3WB", + [XE2_LSC_CACHE_STORE_L1S_L3UC] = "L1S_L3UC", + [XE2_LSC_CACHE_STORE_L1S_L3WB] = "L1S_L3WB", + [XE2_LSC_CACHE_STORE_L1WB_L3WB] = "L1WB_L3WB", +}; + +static const char* const dpas_systolic_depth[4] = { + [0] = "16", + [1] = "2", + [2] = "4", + [3] = "8" +}; + +static int column; + +static int +string(FILE *file, const 
char *string) +{ + fputs(string, file); + column += strlen(string); + return 0; +} + +static int +format(FILE *f, const char *format, ...) PRINTFLIKE(2, 3); + +static int +format(FILE *f, const char *format, ...) +{ + char buf[1024]; + va_list args; + va_start(args, format); + + vsnprintf(buf, sizeof(buf) - 1, format, args); + va_end(args); + string(f, buf); + return 0; +} + +static int +newline(FILE *f) +{ + putc('\n', f); + column = 0; + return 0; +} + +static int +pad(FILE *f, int c) +{ + do + string(f, " "); + while (column < c); + return 0; +} + +static int +control(FILE *file, const char *name, const char *const ctrl[], + unsigned id, int *space) +{ + if (!ctrl[id]) { + fprintf(file, "*** invalid %s value %d ", name, id); + return 1; + } + if (ctrl[id][0]) { + if (space && *space) + string(file, " "); + string(file, ctrl[id]); + if (space) + *space = 1; + } + return 0; +} + +static int +print_opcode(FILE *file, const struct brw_isa_info *isa, + enum opcode id) +{ + const struct opcode_desc *desc = brw_opcode_desc(isa, id); + if (!desc) { + format(file, "*** invalid opcode value %d ", id); + return 1; + } + string(file, desc->name); + return 0; +} + +static int +reg(FILE *file, unsigned _reg_file, unsigned _reg_nr) +{ + int err = 0; + + /* Clear the Compr4 instruction compression bit. 
*/ + if (_reg_file == BRW_MESSAGE_REGISTER_FILE) + _reg_nr &= ~BRW_MRF_COMPR4; + + if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) { + switch (_reg_nr & 0xf0) { + case BRW_ARF_NULL: + string(file, "null"); + break; + case BRW_ARF_ADDRESS: + format(file, "a%d", _reg_nr & 0x0f); + break; + case BRW_ARF_ACCUMULATOR: + format(file, "acc%d", _reg_nr & 0x0f); + break; + case BRW_ARF_FLAG: + format(file, "f%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK: + format(file, "mask%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK_STACK: + format(file, "ms%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK_STACK_DEPTH: + format(file, "msd%d", _reg_nr & 0x0f); + break; + case BRW_ARF_STATE: + format(file, "sr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_CONTROL: + format(file, "cr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_NOTIFICATION_COUNT: + format(file, "n%d", _reg_nr & 0x0f); + break; + case BRW_ARF_IP: + string(file, "ip"); + return -1; + break; + case BRW_ARF_TDR: + format(file, "tdr0"); + return -1; + case BRW_ARF_TIMESTAMP: + format(file, "tm%d", _reg_nr & 0x0f); + break; + default: + format(file, "ARF%d", _reg_nr); + break; + } + } else { + err |= control(file, "src reg file", reg_file, _reg_file, NULL); + format(file, "%d", _reg_nr); + } + return err; +} + +static int +dest(FILE *file, const struct brw_isa_info *isa, const brw_inst *inst) +{ + const struct intel_device_info *devinfo = isa->devinfo; + enum brw_reg_type type = brw_inst_dst_type(devinfo, inst); + unsigned elem_size = brw_reg_type_to_size(type); + int err = 0; + + if (is_split_send(devinfo, brw_inst_opcode(isa, inst))) { + /* These are fixed for split sends */ + type = BRW_REGISTER_TYPE_UD; + elem_size = 4; + if (devinfo->ver >= 12) { + err |= reg(file, brw_inst_send_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + string(file, brw_reg_type_to_letters(type)); + } else if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + err |= reg(file, 
brw_inst_send_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + unsigned subreg_nr = brw_inst_dst_da16_subreg_nr(devinfo, inst); + if (subreg_nr) + format(file, ".%u", subreg_nr); + string(file, brw_reg_type_to_letters(type)); + } else { + string(file, "g[a0"); + if (brw_inst_dst_ia_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_dst_ia_subreg_nr(devinfo, inst) / + elem_size); + if (brw_inst_send_dst_ia16_addr_imm(devinfo, inst)) + format(file, " %d", brw_inst_send_dst_ia16_addr_imm(devinfo, inst)); + string(file, "]<"); + string(file, brw_reg_type_to_letters(type)); + } + } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + err |= reg(file, brw_inst_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (brw_inst_dst_da1_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_dst_da1_subreg_nr(devinfo, inst) / + elem_size); + string(file, "<"); + err |= control(file, "horiz stride", horiz_stride, + brw_inst_dst_hstride(devinfo, inst), NULL); + string(file, ">"); + string(file, brw_reg_type_to_letters(type)); + } else { + string(file, "g[a0"); + if (brw_inst_dst_ia_subreg_nr(devinfo, inst)) + format(file, ".%"PRIu64, brw_inst_dst_ia_subreg_nr(devinfo, inst) / + elem_size); + if (brw_inst_dst_ia1_addr_imm(devinfo, inst)) + format(file, " %d", brw_inst_dst_ia1_addr_imm(devinfo, inst)); + string(file, "]<"); + err |= control(file, "horiz stride", horiz_stride, + brw_inst_dst_hstride(devinfo, inst), NULL); + string(file, ">"); + string(file, brw_reg_type_to_letters(type)); + } + } else { + if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + err |= reg(file, brw_inst_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + if (brw_inst_dst_da16_subreg_nr(devinfo, inst)) + format(file, ".%u", 16 / elem_size); + string(file, "<1>"); 
+ err |= control(file, "writemask", writemask, + brw_inst_da16_writemask(devinfo, inst), NULL); + string(file, brw_reg_type_to_letters(type)); + } else { + err = 1; + string(file, "Indirect align16 address mode not supported"); + } + } + + return 0; +} + +static int +dest_3src(FILE *file, const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1; + int err = 0; + uint32_t reg_file; + unsigned subreg_nr; + enum brw_reg_type type; + + if (devinfo->ver < 10 && is_align1) + return 0; + + if (devinfo->ver == 6 && brw_inst_3src_a16_dst_reg_file(devinfo, inst)) + reg_file = BRW_MESSAGE_REGISTER_FILE; + else if (devinfo->ver >= 12) + reg_file = brw_inst_3src_a1_dst_reg_file(devinfo, inst); + else if (is_align1 && brw_inst_3src_a1_dst_reg_file(devinfo, inst)) + reg_file = BRW_ARCHITECTURE_REGISTER_FILE; + else + reg_file = BRW_GENERAL_REGISTER_FILE; + + err |= reg(file, reg_file, brw_inst_3src_dst_reg_nr(devinfo, inst)); + if (err == -1) + return 0; + + if (is_align1) { + type = brw_inst_3src_a1_dst_type(devinfo, inst); + subreg_nr = brw_inst_3src_a1_dst_subreg_nr(devinfo, inst); + } else { + type = brw_inst_3src_a16_dst_type(devinfo, inst); + subreg_nr = brw_inst_3src_a16_dst_subreg_nr(devinfo, inst) * 4; + } + subreg_nr /= brw_reg_type_to_size(type); + + if (subreg_nr) + format(file, ".%u", subreg_nr); + string(file, "<1>"); + + if (!is_align1) { + err |= control(file, "writemask", writemask, + brw_inst_3src_a16_dst_writemask(devinfo, inst), NULL); + } + string(file, brw_reg_type_to_letters(type)); + + return 0; +} + +static int +dest_dpas_3src(FILE *file, const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + uint32_t reg_file = brw_inst_dpas_3src_dst_reg_file(devinfo, inst); + + if (reg(file, reg_file, brw_inst_dpas_3src_dst_reg_nr(devinfo, inst)) == -1) + return 0; + + enum brw_reg_type type = brw_inst_dpas_3src_dst_type(devinfo, inst); + unsigned subreg_nr = 
   brw_inst_dpas_3src_dst_subreg_nr(devinfo, inst);

   if (subreg_nr)
      format(file, ".%u", subreg_nr);
   string(file, "<1>");

   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print an Align1 region description in "<vstride,width,hstride>" form. */
static int
src_align1_region(FILE *file,
                  unsigned _vert_stride, unsigned _width,
                  unsigned _horiz_stride)
{
   int err = 0;
   string(file, "<");
   err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
   string(file, ",");
   err |= control(file, "width", width, _width, NULL);
   string(file, ",");
   err |= control(file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
   string(file, ">");
   return err;
}

/* Print a direct-addressed Align1 source operand: negate/abs modifiers
 * (negate prints as "~" for logic ops on Gfx8+), register, subregister in
 * element units, region, and type suffix.
 */
static int
src_da1(FILE *file,
        const struct intel_device_info *devinfo,
        unsigned opcode,
        enum brw_reg_type type, unsigned _reg_file,
        unsigned _vert_stride, unsigned _width, unsigned _horiz_stride,
        unsigned reg_num, unsigned sub_reg_num, unsigned __abs,
        unsigned _negate)
{
   int err = 0;

   /* On Gfx8+ the negate bit means bitwise-NOT for logic instructions. */
   if (devinfo->ver >= 8 && is_logic_instruction(opcode))
      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
   else
      err |= control(file, "negate", m_negate, _negate, NULL);

   err |= control(file, "abs", _abs, __abs, NULL);

   err |= reg(file, _reg_file, reg_num);
   if (err == -1)
      return 0;
   if (sub_reg_num) {
      /* Subregister number is encoded in bytes; print in element units. */
      unsigned elem_size = brw_reg_type_to_size(type);
      format(file, ".%d", sub_reg_num / elem_size);   /* use formal style like spec */
   }
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print a register-indirect (a0-relative) Align1 source operand. */
static int
src_ia1(FILE *file,
        const struct intel_device_info *devinfo,
        unsigned opcode,
        enum brw_reg_type type,
        int _addr_imm,
        unsigned _addr_subreg_nr,
        unsigned _negate,
        unsigned __abs,
        unsigned _horiz_stride, unsigned _width, unsigned _vert_stride)
{
   int err = 0;

   if (devinfo->ver >= 8 && is_logic_instruction(opcode))
      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
   else
      err |= control(file, "negate", m_negate, _negate, NULL);

   err |= control(file, "abs", _abs, __abs, NULL);

   string(file, "g[a0");
   if (_addr_subreg_nr)
      format(file, ".%d", _addr_subreg_nr);
   if (_addr_imm)
      format(file, " %d", _addr_imm);
   string(file, "]");
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print a 4-channel swizzle suffix: a single channel letter when all four
 * selects are identical, nothing for the identity .xyzw, otherwise all four.
 */
static int
src_swizzle(FILE *file, unsigned swiz)
{
   unsigned x = BRW_GET_SWZ(swiz, BRW_CHANNEL_X);
   unsigned y = BRW_GET_SWZ(swiz, BRW_CHANNEL_Y);
   unsigned z = BRW_GET_SWZ(swiz, BRW_CHANNEL_Z);
   unsigned w = BRW_GET_SWZ(swiz, BRW_CHANNEL_W);
   int err = 0;

   if (x == y && x == z && x == w) {
      /* Replicated swizzle: print one channel. */
      string(file, ".");
      err |= control(file, "channel select", chan_sel, x, NULL);
   } else if (swiz != BRW_SWIZZLE_XYZW) {
      string(file, ".");
      err |= control(file, "channel select", chan_sel, x, NULL);
      err |= control(file, "channel select", chan_sel, y, NULL);
      err |= control(file, "channel select", chan_sel, z, NULL);
      err |= control(file, "channel select", chan_sel, w, NULL);
   }
   return err;
}

/* Print a direct-addressed Align16 source operand (modifiers, register,
 * subregister, vertical stride, swizzle, type suffix).
 */
static int
src_da16(FILE *file,
         const struct intel_device_info *devinfo,
         unsigned opcode,
         enum brw_reg_type type,
         unsigned _reg_file,
         unsigned _vert_stride,
         unsigned _reg_nr,
         unsigned _subreg_nr,
         unsigned __abs,
         unsigned _negate,
         unsigned swz_x, unsigned swz_y, unsigned swz_z, unsigned swz_w)
{
   int err = 0;

   if (devinfo->ver >= 8 && is_logic_instruction(opcode))
      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
   else
      err |= control(file, "negate", m_negate, _negate, NULL);

   err |= control(file, "abs", _abs, __abs, NULL);

   err |= reg(file, _reg_file, _reg_nr);
   if (err == -1)
      return 0;
   if (_subreg_nr) {
      unsigned elem_size = brw_reg_type_to_size(type);

      /* bit4 for subreg number byte addressing. Make this same meaning as
         in da1 case, so output looks consistent. */
      format(file, ".%d", 16 / elem_size);
   }
   string(file, "<");
   err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
   string(file, ">");
   err |= src_swizzle(file, BRW_SWIZZLE4(swz_x, swz_y, swz_z, swz_w));
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Map the 3-src Align1 vertical-stride encoding to the common vstride
 * encoding.  Note the "2" encoding means vstride 1 on Gfx12+.
 */
static enum brw_vertical_stride
vstride_from_align1_3src_vstride(const struct intel_device_info *devinfo,
                                 enum gfx10_align1_3src_vertical_stride vstride)
{
   switch (vstride) {
   case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0: return BRW_VERTICAL_STRIDE_0;
   case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2:
      if (devinfo->ver >= 12)
         return BRW_VERTICAL_STRIDE_1;
      else
         return BRW_VERTICAL_STRIDE_2;
   case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4: return BRW_VERTICAL_STRIDE_4;
   case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8: return BRW_VERTICAL_STRIDE_8;
   default:
      unreachable("not reached");
   }
}

/* Map the 3-src Align1 horizontal-stride encoding to the common hstride
 * encoding.
 */
static enum brw_horizontal_stride
hstride_from_align1_3src_hstride(enum gfx10_align1_3src_src_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0: return BRW_HORIZONTAL_STRIDE_0;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1: return BRW_HORIZONTAL_STRIDE_1;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2: return BRW_HORIZONTAL_STRIDE_2;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4: return BRW_HORIZONTAL_STRIDE_4;
   default:
      unreachable("not reached");
   }
}

/* Map a 3-src Align1 horizontal-stride encoding onto a vertical stride;
 * used for src2, which has no vstride field of its own (see FINISHME in
 * src2_3src below).
 */
static enum brw_vertical_stride
vstride_from_align1_3src_hstride(enum gfx10_align1_3src_src_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0: return BRW_VERTICAL_STRIDE_0;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1: return BRW_VERTICAL_STRIDE_1;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2: return BRW_VERTICAL_STRIDE_2;
   case BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4: return BRW_VERTICAL_STRIDE_4;
   default:
      unreachable("not reached");
   }
}

/* From "GFX10 Regioning Rules for Align1 Ternary Operations" in the
 * "Register Region
 Restrictions" documentation
 */
static enum brw_width
implied_width(enum brw_vertical_stride _vert_stride,
              enum brw_horizontal_stride _horiz_stride)
{
   /* "1. Width is 1 when Vertical and Horizontal Strides are both zero." */
   if (_vert_stride == BRW_VERTICAL_STRIDE_0 &&
       _horiz_stride == BRW_HORIZONTAL_STRIDE_0) {
      return BRW_WIDTH_1;

   /* "2. Width is equal to vertical stride when Horizontal Stride is zero." */
   } else if (_horiz_stride == BRW_HORIZONTAL_STRIDE_0) {
      switch (_vert_stride) {
      case BRW_VERTICAL_STRIDE_1: return BRW_WIDTH_1;
      case BRW_VERTICAL_STRIDE_2: return BRW_WIDTH_2;
      case BRW_VERTICAL_STRIDE_4: return BRW_WIDTH_4;
      case BRW_VERTICAL_STRIDE_8: return BRW_WIDTH_8;
      case BRW_VERTICAL_STRIDE_0:
      default:
         unreachable("not reached");
      }

   } else {
      /* FINISHME: Implement these: */

      /* "3. Width is equal to Vertical Stride/Horizontal Stride when both
       *     Strides are non-zero.
       *
       *  4. Vertical Stride must not be zero if Horizontal Stride is non-zero.
       *     This implies Vertical Stride is always greater than Horizontal
       *     Stride."
       *
       * Given these statements and the knowledge that the stride and width
       * values are encoded in logarithmic form, we can perform the division
       * by just subtracting.
       */
      return _vert_stride - _horiz_stride;
   }
}

/* Print src0 of a 3-source instruction, handling both the Align1 (Gfx10+)
 * and Align16 encodings, including Align1 immediate sources.
 */
static int
src0_3src(FILE *file, const struct intel_device_info *devinfo,
          const brw_inst *inst)
{
   int err = 0;
   unsigned reg_nr, subreg_nr;
   enum brw_reg_file _file;
   enum brw_reg_type type;
   enum brw_vertical_stride _vert_stride;
   enum brw_width _width;
   enum brw_horizontal_stride _horiz_stride;
   bool is_scalar_region;
   bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1;

   /* Align1 3-src encoding only exists on Gfx10+. */
   if (devinfo->ver < 10 && is_align1)
      return 0;

   if (is_align1) {
      if (devinfo->ver >= 12 && !brw_inst_3src_a1_src0_is_imm(devinfo, inst)) {
         _file = brw_inst_3src_a1_src0_reg_file(devinfo, inst);
      } else if (brw_inst_3src_a1_src0_reg_file(devinfo, inst) ==
                 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) {
         _file = BRW_GENERAL_REGISTER_FILE;
      } else if (brw_inst_3src_a1_src0_type(devinfo, inst) ==
                 BRW_REGISTER_TYPE_NF) {
         _file = BRW_ARCHITECTURE_REGISTER_FILE;
      } else {
         /* Immediate source: print the 16-bit value and return. */
         _file = BRW_IMMEDIATE_VALUE;
         uint16_t imm_val = brw_inst_3src_a1_src0_imm(devinfo, inst);
         enum brw_reg_type type = brw_inst_3src_a1_src0_type(devinfo, inst);

         if (type == BRW_REGISTER_TYPE_W) {
            format(file, "%dW", imm_val);
         } else if (type == BRW_REGISTER_TYPE_UW) {
            format(file, "0x%04xUW", imm_val);
         } else if (type == BRW_REGISTER_TYPE_HF) {
            format(file, "0x%04xHF", imm_val);
         }
         return 0;
      }

      reg_nr = brw_inst_3src_src0_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a1_src0_subreg_nr(devinfo, inst);
      type = brw_inst_3src_a1_src0_type(devinfo, inst);
      _vert_stride = vstride_from_align1_3src_vstride(
         devinfo, brw_inst_3src_a1_src0_vstride(devinfo, inst));
      _horiz_stride = hstride_from_align1_3src_hstride(
         brw_inst_3src_a1_src0_hstride(devinfo, inst));
      _width = implied_width(_vert_stride, _horiz_stride);
   } else {
      /* Align16: subreg field counts in dwords; region is fixed by rep_ctrl. */
      _file = BRW_GENERAL_REGISTER_FILE;
      reg_nr = brw_inst_3src_src0_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a16_src0_subreg_nr(devinfo, inst) * 4;
      type = brw_inst_3src_a16_src_type(devinfo, inst);

      if (brw_inst_3src_a16_src0_rep_ctrl(devinfo, inst)) {
         _vert_stride = BRW_VERTICAL_STRIDE_0;
         _width = BRW_WIDTH_1;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_0;
      } else {
         _vert_stride = BRW_VERTICAL_STRIDE_4;
         _width = BRW_WIDTH_4;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_1;
      }
   }
   is_scalar_region = _vert_stride == BRW_VERTICAL_STRIDE_0 &&
                      _width == BRW_WIDTH_1 &&
                      _horiz_stride == BRW_HORIZONTAL_STRIDE_0;

   /* Convert the byte-based subreg number to element units for printing. */
   subreg_nr /= brw_reg_type_to_size(type);

   err |= control(file, "negate", m_negate,
                  brw_inst_3src_src0_negate(devinfo, inst), NULL);
   err |= control(file, "abs", _abs, brw_inst_3src_src0_abs(devinfo, inst), NULL);

   err |= reg(file, _file, reg_nr);
   if (err == -1)
      return 0;
   if (subreg_nr || is_scalar_region)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   if (!is_scalar_region && !is_align1)
      err |= src_swizzle(file, brw_inst_3src_a16_src0_swizzle(devinfo, inst));
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print src1 of a 3-source instruction; same structure as src0_3src but
 * src1 has no immediate form.
 */
static int
src1_3src(FILE *file, const struct intel_device_info *devinfo,
          const brw_inst *inst)
{
   int err = 0;
   unsigned reg_nr, subreg_nr;
   enum brw_reg_file _file;
   enum brw_reg_type type;
   enum brw_vertical_stride _vert_stride;
   enum brw_width _width;
   enum brw_horizontal_stride _horiz_stride;
   bool is_scalar_region;
   bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1;

   if (devinfo->ver < 10 && is_align1)
      return 0;

   if (is_align1) {
      if (devinfo->ver >= 12) {
         _file = brw_inst_3src_a1_src1_reg_file(devinfo, inst);
      } else if (brw_inst_3src_a1_src1_reg_file(devinfo, inst) ==
                 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) {
         _file = BRW_GENERAL_REGISTER_FILE;
      } else {
         _file = BRW_ARCHITECTURE_REGISTER_FILE;
      }

      reg_nr = brw_inst_3src_src1_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a1_src1_subreg_nr(devinfo, inst);
      type = brw_inst_3src_a1_src1_type(devinfo, inst);

      _vert_stride = vstride_from_align1_3src_vstride(
         devinfo, brw_inst_3src_a1_src1_vstride(devinfo, inst));
      _horiz_stride = hstride_from_align1_3src_hstride(
         brw_inst_3src_a1_src1_hstride(devinfo, inst));
      _width = implied_width(_vert_stride, _horiz_stride);
   } else {
      _file = BRW_GENERAL_REGISTER_FILE;
      reg_nr = brw_inst_3src_src1_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a16_src1_subreg_nr(devinfo, inst) * 4;
      type = brw_inst_3src_a16_src_type(devinfo, inst);

      if (brw_inst_3src_a16_src1_rep_ctrl(devinfo, inst)) {
         _vert_stride = BRW_VERTICAL_STRIDE_0;
         _width = BRW_WIDTH_1;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_0;
      } else {
         _vert_stride = BRW_VERTICAL_STRIDE_4;
         _width = BRW_WIDTH_4;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_1;
      }
   }
   is_scalar_region = _vert_stride == BRW_VERTICAL_STRIDE_0 &&
                      _width == BRW_WIDTH_1 &&
                      _horiz_stride == BRW_HORIZONTAL_STRIDE_0;

   subreg_nr /= brw_reg_type_to_size(type);

   err |= control(file, "negate", m_negate,
                  brw_inst_3src_src1_negate(devinfo, inst), NULL);
   err |= control(file, "abs", _abs, brw_inst_3src_src1_abs(devinfo, inst), NULL);

   err |= reg(file, _file, reg_nr);
   if (err == -1)
      return 0;
   if (subreg_nr || is_scalar_region)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   if (!is_scalar_region && !is_align1)
      err |= src_swizzle(file, brw_inst_3src_a16_src1_swizzle(devinfo, inst));
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print src2 of a 3-source instruction.  Like src0 it may be an Align1
 * immediate; unlike src0/src1 it has no vstride field (see FINISHME).
 */
static int
src2_3src(FILE *file, const struct intel_device_info *devinfo,
          const brw_inst *inst)
{
   int err = 0;
   unsigned reg_nr, subreg_nr;
   enum brw_reg_file _file;
   enum brw_reg_type type;
   enum brw_vertical_stride _vert_stride;
   enum brw_width _width;
   enum brw_horizontal_stride _horiz_stride;
   bool is_scalar_region;
   bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1;

   if (devinfo->ver < 10 && is_align1)
      return 0;

   if (is_align1) {
      if (devinfo->ver >= 12 && !brw_inst_3src_a1_src2_is_imm(devinfo, inst)) {
         _file = brw_inst_3src_a1_src2_reg_file(devinfo, inst);
      } else if (brw_inst_3src_a1_src2_reg_file(devinfo, inst) ==
                 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) {
         _file = BRW_GENERAL_REGISTER_FILE;
      } else {
         /* Immediate source: print the 16-bit value and return. */
         _file = BRW_IMMEDIATE_VALUE;
         uint16_t imm_val = brw_inst_3src_a1_src2_imm(devinfo, inst);
         enum brw_reg_type type = brw_inst_3src_a1_src2_type(devinfo, inst);

         if (type == BRW_REGISTER_TYPE_W) {
            format(file, "%dW", imm_val);
         } else if (type == BRW_REGISTER_TYPE_UW) {
            format(file, "0x%04xUW", imm_val);
         } else if (type == BRW_REGISTER_TYPE_HF) {
            format(file, "0x%04xHF", imm_val);
         }
         return 0;
      }

      reg_nr = brw_inst_3src_src2_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a1_src2_subreg_nr(devinfo, inst);
      type = brw_inst_3src_a1_src2_type(devinfo, inst);
      /* FINISHME: No vertical stride on src2. Is using the hstride in place
       *           correct? Doesn't seem like it, since there's hstride=1 but
       *           no vstride=1.
       */
      _vert_stride = vstride_from_align1_3src_hstride(
         brw_inst_3src_a1_src2_hstride(devinfo, inst));
      _horiz_stride = hstride_from_align1_3src_hstride(
         brw_inst_3src_a1_src2_hstride(devinfo, inst));
      _width = implied_width(_vert_stride, _horiz_stride);
   } else {
      _file = BRW_GENERAL_REGISTER_FILE;
      reg_nr = brw_inst_3src_src2_reg_nr(devinfo, inst);
      subreg_nr = brw_inst_3src_a16_src2_subreg_nr(devinfo, inst) * 4;
      type = brw_inst_3src_a16_src_type(devinfo, inst);

      if (brw_inst_3src_a16_src2_rep_ctrl(devinfo, inst)) {
         _vert_stride = BRW_VERTICAL_STRIDE_0;
         _width = BRW_WIDTH_1;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_0;
      } else {
         _vert_stride = BRW_VERTICAL_STRIDE_4;
         _width = BRW_WIDTH_4;
         _horiz_stride = BRW_HORIZONTAL_STRIDE_1;
      }
   }
   is_scalar_region = _vert_stride == BRW_VERTICAL_STRIDE_0 &&
                      _width == BRW_WIDTH_1 &&
                      _horiz_stride == BRW_HORIZONTAL_STRIDE_0;

   subreg_nr /= brw_reg_type_to_size(type);

   err |= control(file, "negate", m_negate,
                  brw_inst_3src_src2_negate(devinfo, inst), NULL);
   err |= control(file, "abs", _abs, brw_inst_3src_src2_abs(devinfo, inst), NULL);

   err |= reg(file, _file, reg_nr);
   if (err == -1)
      return 0;
   if (subreg_nr || is_scalar_region)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, _vert_stride, _width, _horiz_stride);
   if (!is_scalar_region && !is_align1)
      err |= src_swizzle(file, brw_inst_3src_a16_src2_swizzle(devinfo, inst));
   string(file, brw_reg_type_to_letters(type));
   return err;
}

/* Print src0 of a DPAS instruction; the region is always <1,1,0>. */
static int
src0_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
               const brw_inst *inst)
{
   uint32_t reg_file = brw_inst_dpas_3src_src0_reg_file(devinfo, inst);

   if (reg(file, reg_file, brw_inst_dpas_3src_src0_reg_nr(devinfo, inst)) == -1)
      return 0;

   unsigned subreg_nr = brw_inst_dpas_3src_src0_subreg_nr(devinfo, inst);
   enum brw_reg_type type = brw_inst_dpas_3src_src0_type(devinfo, inst);

   if (subreg_nr)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, 1, 1, 0);

   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print src1 of a DPAS instruction; the region is always <1,1,0>. */
static int
src1_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
               const brw_inst *inst)
{
   uint32_t reg_file = brw_inst_dpas_3src_src1_reg_file(devinfo, inst);

   if (reg(file, reg_file, brw_inst_dpas_3src_src1_reg_nr(devinfo, inst)) == -1)
      return 0;

   unsigned subreg_nr = brw_inst_dpas_3src_src1_subreg_nr(devinfo, inst);
   enum brw_reg_type type = brw_inst_dpas_3src_src1_type(devinfo, inst);

   if (subreg_nr)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, 1, 1, 0);

   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print src2 of a DPAS instruction; the region is always <1,1,0>. */
static int
src2_dpas_3src(FILE *file, const struct intel_device_info *devinfo,
               const brw_inst *inst)
{
   uint32_t reg_file = brw_inst_dpas_3src_src2_reg_file(devinfo, inst);

   if (reg(file, reg_file, brw_inst_dpas_3src_src2_reg_nr(devinfo, inst)) == -1)
      return 0;

   unsigned subreg_nr = brw_inst_dpas_3src_src2_subreg_nr(devinfo, inst);
   enum brw_reg_type type = brw_inst_dpas_3src_src2_type(devinfo, inst);

   if (subreg_nr)
      format(file, ".%d", subreg_nr);
   src_align1_region(file, 1, 1, 0);

   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print an immediate operand according to its register type; packed vector
 * and float types additionally get a decoded comment after column 48.
 */
static int
imm(FILE *file, const struct brw_isa_info *isa, enum brw_reg_type type,
    const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   switch (type) {
   case BRW_REGISTER_TYPE_UQ:
      format(file, "0x%016"PRIx64"UQ", brw_inst_imm_uq(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_Q:
      format(file, "0x%016"PRIx64"Q", brw_inst_imm_uq(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_UD:
      format(file, "0x%08xUD", brw_inst_imm_ud(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_D:
      format(file, "%dD", brw_inst_imm_d(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_UW:
      format(file, "0x%04xUW", (uint16_t) brw_inst_imm_ud(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_W:
      format(file, "%dW", (int16_t) brw_inst_imm_d(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_UV:
      format(file, "0x%08xUV", brw_inst_imm_ud(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_VF:
      format(file, "0x%"PRIx64"VF", brw_inst_bits(inst, 127, 96));
      pad(file, 48);
      /* Decode the four packed restricted floats for readability. */
      format(file, "/* [%-gF, %-gF, %-gF, %-gF]VF */",
             brw_vf_to_float(brw_inst_imm_ud(devinfo, inst)),
             brw_vf_to_float(brw_inst_imm_ud(devinfo, inst) >> 8),
             brw_vf_to_float(brw_inst_imm_ud(devinfo, inst) >> 16),
             brw_vf_to_float(brw_inst_imm_ud(devinfo, inst) >> 24));
      break;
   case BRW_REGISTER_TYPE_V:
      format(file, "0x%08xV", brw_inst_imm_ud(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_F:
      /* The DIM instruction's src0 uses an F type but contains a
       * 64-bit immediate
       */
      if (brw_inst_opcode(isa, inst) == BRW_OPCODE_DIM) {
         format(file, "0x%"PRIx64"F", brw_inst_bits(inst, 127, 64));
         pad(file, 48);
         format(file, "/* %-gF */", brw_inst_imm_df(devinfo, inst));
      } else {
         format(file, "0x%"PRIx64"F", brw_inst_bits(inst, 127, 96));
         pad(file, 48);
         format(file, " /* %-gF */", brw_inst_imm_f(devinfo, inst));
      }
      break;
   case BRW_REGISTER_TYPE_DF:
      format(file, "0x%016"PRIx64"DF", brw_inst_imm_uq(devinfo, inst));
      pad(file, 48);
      format(file, "/* %-gDF */", brw_inst_imm_df(devinfo, inst));
      break;
   case BRW_REGISTER_TYPE_HF:
      format(file, "0x%04xHF",
             (uint16_t) brw_inst_imm_ud(devinfo, inst));
      pad(file, 48);
      format(file, "/* %-gHF */",
             _mesa_half_to_float((uint16_t) brw_inst_imm_ud(devinfo, inst)));
      break;
   case BRW_REGISTER_TYPE_NF:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_B:
      /* These types have no valid immediate encoding. */
      format(file, "*** invalid immediate type %d ", type);
   }
   return 0;
}

/* Print a direct-addressed SEND(S) payload source register. */
static int
src_sends_da(FILE *file,
             const struct intel_device_info *devinfo,
             enum brw_reg_type type,
             enum brw_reg_file _reg_file,
             unsigned _reg_nr,
             unsigned _reg_subnr)
{
   int err = 0;

   err |= reg(file, _reg_file, _reg_nr);
   if (err == -1)
      return 0;
   if (_reg_subnr)
      format(file, ".1");
   string(file, brw_reg_type_to_letters(type));

   return err;
}

/* Print a register-indirect SEND(S) payload source. */
static int
src_sends_ia(FILE *file,
             const struct intel_device_info *devinfo,
             enum brw_reg_type type,
             int _addr_imm,
             unsigned _addr_subreg_nr)
{
   string(file, "g[a0");
   if (_addr_subreg_nr)
      format(file, ".1");
   if (_addr_imm)
      format(file, " %d", _addr_imm);
   string(file, "]");
   string(file, brw_reg_type_to_letters(type));

   return 0;
}

/* Print an indirect message-descriptor source ("a0.N<0>UD"). */
static int
src_send_desc_ia(FILE *file,
                 const struct intel_device_info *devinfo,
                 unsigned _addr_subreg_nr)
{
   string(file, "a0");
   if (_addr_subreg_nr)
      format(file, ".%d", _addr_subreg_nr);
   format(file, "<0>UD");

   return 0;
}

/* Print src0, dispatching on split-send / immediate / access mode /
 * addressing mode to the appropriate helper.
 */
static int
src0(FILE *file, const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   if (is_split_send(devinfo, brw_inst_opcode(isa, inst))) {
      if (devinfo->ver >= 12) {
         return src_sends_da(file,
                             devinfo,
                             BRW_REGISTER_TYPE_UD,
                             brw_inst_send_src0_reg_file(devinfo, inst),
                             brw_inst_src0_da_reg_nr(devinfo, inst),
                             0);
      } else if (brw_inst_send_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_sends_da(file,
                             devinfo,
                             BRW_REGISTER_TYPE_UD,
                             BRW_GENERAL_REGISTER_FILE,
                             brw_inst_src0_da_reg_nr(devinfo, inst),
                             brw_inst_src0_da16_subreg_nr(devinfo, inst));
      } else {
         return src_sends_ia(file,
                             devinfo,
                             BRW_REGISTER_TYPE_UD,
                             brw_inst_send_src0_ia16_addr_imm(devinfo, inst),
                             brw_inst_src0_ia_subreg_nr(devinfo, inst));
      }
   } else if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      return imm(file, isa, brw_inst_src0_type(devinfo, inst), inst);
   } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      if (brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_da1(file,
                        devinfo,
                        brw_inst_opcode(isa, inst),
                        brw_inst_src0_type(devinfo, inst),
                        brw_inst_src0_reg_file(devinfo, inst),
                        brw_inst_src0_vstride(devinfo, inst),
                        brw_inst_src0_width(devinfo, inst),
                        brw_inst_src0_hstride(devinfo, inst),
                        brw_inst_src0_da_reg_nr(devinfo, inst),
                        brw_inst_src0_da1_subreg_nr(devinfo, inst),
                        brw_inst_src0_abs(devinfo, inst),
                        brw_inst_src0_negate(devinfo, inst));
      } else {
         return src_ia1(file,
                        devinfo,
                        brw_inst_opcode(isa, inst),
                        brw_inst_src0_type(devinfo, inst),
                        brw_inst_src0_ia1_addr_imm(devinfo, inst),
                        brw_inst_src0_ia_subreg_nr(devinfo, inst),
                        brw_inst_src0_negate(devinfo, inst),
                        brw_inst_src0_abs(devinfo, inst),
                        brw_inst_src0_hstride(devinfo, inst),
                        brw_inst_src0_width(devinfo, inst),
                        brw_inst_src0_vstride(devinfo, inst));
      }
   } else {
      if (brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_da16(file,
                         devinfo,
                         brw_inst_opcode(isa, inst),
                         brw_inst_src0_type(devinfo, inst),
                         brw_inst_src0_reg_file(devinfo, inst),
                         brw_inst_src0_vstride(devinfo, inst),
                         brw_inst_src0_da_reg_nr(devinfo, inst),
                         brw_inst_src0_da16_subreg_nr(devinfo, inst),
                         brw_inst_src0_abs(devinfo, inst),
                         brw_inst_src0_negate(devinfo, inst),
                         brw_inst_src0_da16_swiz_x(devinfo, inst),
                         brw_inst_src0_da16_swiz_y(devinfo, inst),
                         brw_inst_src0_da16_swiz_z(devinfo, inst),
                         brw_inst_src0_da16_swiz_w(devinfo, inst));
      } else {
         string(file, "Indirect align16 address mode not supported");
         return 1;
      }
   }
}

/* Print src1; same dispatch structure as src0 (split-send src1 is always
 * direct-addressed).
 */
static int
src1(FILE *file, const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   if (is_split_send(devinfo, brw_inst_opcode(isa, inst))) {
      return src_sends_da(file,
                          devinfo,
                          BRW_REGISTER_TYPE_UD,
                          brw_inst_send_src1_reg_file(devinfo, inst),
                          brw_inst_send_src1_reg_nr(devinfo, inst),
                          0 /* subreg_nr */);
   } else if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      return imm(file, isa, brw_inst_src1_type(devinfo, inst), inst);
   } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      if (brw_inst_src1_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_da1(file,
                        devinfo,
                        brw_inst_opcode(isa, inst),
                        brw_inst_src1_type(devinfo, inst),
                        brw_inst_src1_reg_file(devinfo, inst),
                        brw_inst_src1_vstride(devinfo, inst),
                        brw_inst_src1_width(devinfo, inst),
                        brw_inst_src1_hstride(devinfo, inst),
                        brw_inst_src1_da_reg_nr(devinfo, inst),
                        brw_inst_src1_da1_subreg_nr(devinfo, inst),
                        brw_inst_src1_abs(devinfo, inst),
                        brw_inst_src1_negate(devinfo, inst));
      } else {
         return src_ia1(file,
                        devinfo,
                        brw_inst_opcode(isa, inst),
                        brw_inst_src1_type(devinfo, inst),
                        brw_inst_src1_ia1_addr_imm(devinfo, inst),
                        brw_inst_src1_ia_subreg_nr(devinfo, inst),
                        brw_inst_src1_negate(devinfo, inst),
                        brw_inst_src1_abs(devinfo, inst),
                        brw_inst_src1_hstride(devinfo, inst),
                        brw_inst_src1_width(devinfo, inst),
                        brw_inst_src1_vstride(devinfo, inst));
      }
   } else {
      if (brw_inst_src1_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         return src_da16(file,
                         devinfo,
                         brw_inst_opcode(isa, inst),
                         brw_inst_src1_type(devinfo, inst),
                         brw_inst_src1_reg_file(devinfo, inst),
                         brw_inst_src1_vstride(devinfo, inst),
                         brw_inst_src1_da_reg_nr(devinfo, inst),
                         brw_inst_src1_da16_subreg_nr(devinfo, inst),
                         brw_inst_src1_abs(devinfo, inst),
                         brw_inst_src1_negate(devinfo, inst),
                         brw_inst_src1_da16_swiz_x(devinfo, inst),
                         brw_inst_src1_da16_swiz_y(devinfo, inst),
                         brw_inst_src1_da16_swiz_z(devinfo, inst),
                         brw_inst_src1_da16_swiz_w(devinfo, inst));
      } else {
         string(file, "Indirect align16 address mode not supported");
         return 1;
      }
   }
}

/* Print the quarter/nibble control suffix (" 1N".." 8N", " 1Q".." 4Q",
 * " 1H"/" 2H") implied by exec size and qtr/nib control.
 */
static int
qtr_ctrl(FILE *file, const struct intel_device_info *devinfo,
         const brw_inst *inst)
{
   int qtr_ctl = brw_inst_qtr_control(devinfo, inst);
   int exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   /* Nibble control only exists on Gfx7..Xe; treat it as 0 elsewhere. */
   const unsigned nib_ctl = devinfo->ver < 7 || devinfo->ver >= 20 ? 0 :
                            brw_inst_nib_control(devinfo, inst);

   if (exec_size < 8 || nib_ctl) {
      format(file, " %dN", qtr_ctl * 2 + nib_ctl + 1);
   } else if (exec_size == 8) {
      switch (qtr_ctl) {
      case 0:
         string(file, " 1Q");
         break;
      case 1:
         string(file, " 2Q");
         break;
      case 2:
         string(file, " 3Q");
         break;
      case 3:
         string(file, " 4Q");
         break;
      }
   } else if (exec_size == 16) {
      if (qtr_ctl < 2)
         string(file, " 1H");
      else
         string(file, " 2H");
   }
   return 0;
}

/* Return true if the destination or any source of the instruction uses the
 * given register type.
 */
static bool
inst_has_type(const struct brw_isa_info *isa,
              const brw_inst *inst,
              enum brw_reg_type type)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const unsigned num_sources = brw_num_sources_from_inst(isa, inst);

   if (brw_inst_dst_type(devinfo, inst) == type)
      return true;

   if (num_sources >= 3) {
      if (brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1)
         return brw_inst_3src_a1_src0_type(devinfo, inst) == type ||
                brw_inst_3src_a1_src1_type(devinfo, inst) == type ||
                brw_inst_3src_a1_src2_type(devinfo, inst) == type;
      else
         return brw_inst_3src_a16_src_type(devinfo, inst) == type;
   } else if (num_sources == 2) {
      return brw_inst_src0_type(devinfo, inst) == type ||
             brw_inst_src1_type(devinfo, inst) == type;
   } else {
      return brw_inst_src0_type(devinfo, inst) == type;
   }
}

/* Print software-scoreboard (SWSB) annotations: an in-order regdist
 * dependency ("F@1", "A@2", ...) and/or an SBID token ("$5", "$5.dst",
 * "$5.src").
 */
static int
swsb(FILE *file, const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const enum opcode opcode = brw_inst_opcode(isa, inst);
   const uint32_t x = brw_inst_swsb(devinfo, inst);
   /* Out-of-order (unordered) instructions decode the SWSB field
    * differently; DF counts when doubles go through the math pipe.
    */
   const bool is_unordered =
      opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC ||
      opcode == BRW_OPCODE_MATH || opcode == BRW_OPCODE_DPAS ||
      (devinfo->has_64bit_float_via_math_pipe &&
       inst_has_type(isa, inst, BRW_REGISTER_TYPE_DF));
   const struct tgl_swsb swsb = tgl_swsb_decode(devinfo, is_unordered, x);
   if (swsb.regdist)
      format(file, " %s@%d",
             (swsb.pipe == TGL_PIPE_FLOAT ? "F" :
              swsb.pipe == TGL_PIPE_INT ? "I" :
              swsb.pipe == TGL_PIPE_LONG ? "L" :
              swsb.pipe == TGL_PIPE_ALL ? "A" : "" ),
             swsb.regdist);
   if (swsb.mode)
      format(file, " $%d%s", swsb.sbid,
             (swsb.mode & TGL_SBID_SET ? "" :
              swsb.mode & TGL_SBID_DST ? ".dst" : ".src"));
   return 0;
}

#ifdef DEBUG
/* Debugger convenience: disassemble four raw instruction dwords to stderr. */
static __attribute__((__unused__)) int
brw_disassemble_imm(const struct brw_isa_info *isa,
                    uint32_t dw3, uint32_t dw2, uint32_t dw1, uint32_t dw0)
{
   brw_inst inst;
   inst.data[0] = (((uint64_t) dw1) << 32) | ((uint64_t) dw0);
   inst.data[1] = (((uint64_t) dw3) << 32) | ((uint64_t) dw2);
   return brw_disassemble_inst(stderr, isa, &inst, false, 0, NULL);
}
#endif

/* Print " LABELn" if a label exists at (offset + scaled jump distance). */
static void
write_label(FILE *file, const struct intel_device_info *devinfo,
            const struct brw_label *root_label,
            int offset, int jump)
{
   if (root_label != NULL) {
      /* Jump distances are in units of brw_jump_scale; convert to bytes. */
      int to_bytes_scale = sizeof(brw_inst) / brw_jump_scale(devinfo);
      const struct brw_label *label =
         brw_find_label(root_label, offset + jump * to_bytes_scale);
      if (label != NULL) {
         format(file, " LABEL%d", label->number);
      }
   }
}

/* Decode an LSC extended message descriptor according to the address
 * surface type encoded in the main descriptor.
 */
static void
lsc_disassemble_ex_desc(const struct intel_device_info *devinfo,
                        uint32_t imm_desc,
                        uint32_t imm_ex_desc,
                        FILE *file)
{
   const unsigned addr_type = lsc_msg_desc_addr_type(devinfo, imm_desc);
   switch (addr_type) {
   case LSC_ADDR_SURFTYPE_FLAT:
      format(file, " base_offset %u ",
             lsc_flat_ex_desc_base_offset(devinfo, imm_ex_desc));
      break;
   case LSC_ADDR_SURFTYPE_BSS:
   case LSC_ADDR_SURFTYPE_SS:
      format(file, " surface_state_index %u ",
             lsc_bss_ex_desc_index(devinfo, imm_ex_desc));
      break;
   case LSC_ADDR_SURFTYPE_BTI:
      format(file, " BTI %u ",
             lsc_bti_ex_desc_index(devinfo, imm_ex_desc));
      format(file, " base_offset %u ",
             lsc_bti_ex_desc_base_offset(devinfo, imm_ex_desc));
      break;
   default:
      format(file, "unsupported address surface type %d", addr_type);
      break;
   }
}

/* Return true for shared function IDs handled by the LSC (Gfx12.5+). */
static inline bool
brw_sfid_is_lsc(unsigned sfid)
{
   switch (sfid) {
   case
GFX12_SFID_UGM: + case GFX12_SFID_SLM: + case GFX12_SFID_TGM: + return true; + default: + break; + } + + return false; +} + +int +brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa, + const brw_inst *inst, bool is_compacted, + int offset, const struct brw_label *root_label) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + int err = 0; + int space = 0; + + const enum opcode opcode = brw_inst_opcode(isa, inst); + const struct opcode_desc *desc = brw_opcode_desc(isa, opcode); + + if (brw_inst_pred_control(devinfo, inst)) { + string(file, "("); + err |= control(file, "predicate inverse", pred_inv, + brw_inst_pred_inv(devinfo, inst), NULL); + format(file, "f%"PRIu64".%"PRIu64, + devinfo->ver >= 7 ? brw_inst_flag_reg_nr(devinfo, inst) : 0, + brw_inst_flag_subreg_nr(devinfo, inst)); + if (devinfo->ver >= 20) { + err |= control(file, "predicate control", xe2_pred_ctrl, + brw_inst_pred_control(devinfo, inst), NULL); + } else if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + err |= control(file, "predicate control align1", pred_ctrl_align1, + brw_inst_pred_control(devinfo, inst), NULL); + } else { + err |= control(file, "predicate control align16", pred_ctrl_align16, + brw_inst_pred_control(devinfo, inst), NULL); + } + string(file, ") "); + } + + err |= print_opcode(file, isa, opcode); + + if (!is_send(opcode)) + err |= control(file, "saturate", saturate, brw_inst_saturate(devinfo, inst), + NULL); + + err |= control(file, "debug control", debug_ctrl, + brw_inst_debug_control(devinfo, inst), NULL); + + if (opcode == BRW_OPCODE_MATH) { + string(file, " "); + err |= control(file, "function", math_function, + brw_inst_math_function(devinfo, inst), NULL); + + } else if (opcode == BRW_OPCODE_SYNC) { + string(file, " "); + err |= control(file, "function", sync_function, + brw_inst_cond_modifier(devinfo, inst), NULL); + + } else if (opcode == BRW_OPCODE_DPAS) { + string(file, "."); + + err |= control(file, "systolic depth", dpas_systolic_depth, 
+ brw_inst_dpas_3src_sdepth(devinfo, inst), NULL); + + const unsigned rcount = brw_inst_dpas_3src_rcount(devinfo, inst) + 1; + + format(file, "x%d", rcount); + } else if (!is_send(opcode) && + (devinfo->ver < 12 || + brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE || + type_sz(brw_inst_src0_type(devinfo, inst)) < 8)) { + err |= control(file, "conditional modifier", conditional_modifier, + brw_inst_cond_modifier(devinfo, inst), NULL); + + /* If we're using the conditional modifier, print which flags reg is + * used for it. Note that on gfx6+, the embedded-condition SEL and + * control flow doesn't update flags. + */ + if (brw_inst_cond_modifier(devinfo, inst) && + (devinfo->ver < 6 || (opcode != BRW_OPCODE_SEL && + opcode != BRW_OPCODE_CSEL && + opcode != BRW_OPCODE_IF && + opcode != BRW_OPCODE_WHILE))) { + format(file, ".f%"PRIu64".%"PRIu64, + devinfo->ver >= 7 ? brw_inst_flag_reg_nr(devinfo, inst) : 0, + brw_inst_flag_subreg_nr(devinfo, inst)); + } + } + + if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) { + string(file, "("); + err |= control(file, "execution size", exec_size, + brw_inst_exec_size(devinfo, inst), NULL); + string(file, ")"); + } + + if (opcode == BRW_OPCODE_SEND && devinfo->ver < 6) + format(file, " %"PRIu64, brw_inst_base_mrf(devinfo, inst)); + + if (brw_has_uip(devinfo, opcode)) { + /* Instructions that have UIP also have JIP. 
*/ + pad(file, 16); + string(file, "JIP: "); + write_label(file, devinfo, root_label, offset, brw_inst_jip(devinfo, inst)); + + pad(file, 38); + string(file, "UIP: "); + write_label(file, devinfo, root_label, offset, brw_inst_uip(devinfo, inst)); + } else if (brw_has_jip(devinfo, opcode)) { + int jip; + if (devinfo->ver >= 7) { + jip = brw_inst_jip(devinfo, inst); + } else { + jip = brw_inst_gfx6_jump_count(devinfo, inst); + } + + pad(file, 16); + string(file, "JIP: "); + write_label(file, devinfo, root_label, offset, jip); + } else if (devinfo->ver < 6 && (opcode == BRW_OPCODE_BREAK || + opcode == BRW_OPCODE_CONTINUE || + opcode == BRW_OPCODE_ELSE)) { + pad(file, 16); + format(file, "Jump: %d", brw_inst_gfx4_jump_count(devinfo, inst)); + pad(file, 32); + format(file, "Pop: %"PRIu64, brw_inst_gfx4_pop_count(devinfo, inst)); + } else if (devinfo->ver < 6 && (opcode == BRW_OPCODE_IF || + opcode == BRW_OPCODE_IFF || + opcode == BRW_OPCODE_HALT || + opcode == BRW_OPCODE_WHILE)) { + pad(file, 16); + format(file, "Jump: %d", brw_inst_gfx4_jump_count(devinfo, inst)); + } else if (devinfo->ver < 6 && opcode == BRW_OPCODE_ENDIF) { + pad(file, 16); + format(file, "Pop: %"PRIu64, brw_inst_gfx4_pop_count(devinfo, inst)); + } else if (opcode == BRW_OPCODE_JMPI) { + pad(file, 16); + err |= src1(file, isa, inst); + } else if (opcode == BRW_OPCODE_DPAS) { + pad(file, 16); + err |= dest_dpas_3src(file, devinfo, inst); + + pad(file, 32); + err |= src0_dpas_3src(file, devinfo, inst); + + pad(file, 48); + err |= src1_dpas_3src(file, devinfo, inst); + + pad(file, 64); + err |= src2_dpas_3src(file, devinfo, inst); + + } else if (desc && desc->nsrc == 3) { + pad(file, 16); + err |= dest_3src(file, devinfo, inst); + + pad(file, 32); + err |= src0_3src(file, devinfo, inst); + + pad(file, 48); + err |= src1_3src(file, devinfo, inst); + + pad(file, 64); + err |= src2_3src(file, devinfo, inst); + } else if (desc) { + if (desc->ndst > 0) { + pad(file, 16); + err |= dest(file, isa, inst); + } + 
+ if (desc->nsrc > 0) { + pad(file, 32); + err |= src0(file, isa, inst); + } + + if (desc->nsrc > 1) { + pad(file, 48); + err |= src1(file, isa, inst); + } + } + + if (is_send(opcode)) { + enum brw_message_target sfid = brw_inst_sfid(devinfo, inst); + + bool has_imm_desc = false, has_imm_ex_desc = false; + uint32_t imm_desc = 0, imm_ex_desc = 0; + if (is_split_send(devinfo, opcode)) { + pad(file, 64); + if (brw_inst_send_sel_reg32_desc(devinfo, inst)) { + /* show the indirect descriptor source */ + err |= src_send_desc_ia(file, devinfo, 0); + } else { + has_imm_desc = true; + imm_desc = brw_inst_send_desc(devinfo, inst); + fprintf(file, "0x%08"PRIx32, imm_desc); + } + + pad(file, 80); + if (brw_inst_send_sel_reg32_ex_desc(devinfo, inst)) { + /* show the indirect descriptor source */ + err |= src_send_desc_ia(file, devinfo, + brw_inst_send_ex_desc_ia_subreg_nr(devinfo, inst)); + } else { + has_imm_ex_desc = true; + imm_ex_desc = brw_inst_sends_ex_desc(devinfo, inst); + fprintf(file, "0x%08"PRIx32, imm_ex_desc); + } + } else { + if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) { + /* show the indirect descriptor source */ + pad(file, 48); + err |= src1(file, isa, inst); + pad(file, 64); + } else { + has_imm_desc = true; + imm_desc = brw_inst_send_desc(devinfo, inst); + pad(file, 48); + } + + /* Print message descriptor as immediate source */ + fprintf(file, "0x%08"PRIx64, inst->data[1] >> 32); + } + + newline(file); + pad(file, 16); + space = 0; + + fprintf(file, " "); + err |= control(file, "SFID", devinfo->ver >= 6 ? 
gfx6_sfid : gfx4_sfid, + sfid, &space); + string(file, " MsgDesc:"); + + if (!has_imm_desc) { + format(file, " indirect"); + } else { + bool unsupported = false; + switch (sfid) { + case BRW_SFID_MATH: + err |= control(file, "math function", math_function, + brw_inst_math_msg_function(devinfo, inst), &space); + err |= control(file, "math saturate", math_saturate, + brw_inst_math_msg_saturate(devinfo, inst), &space); + err |= control(file, "math signed", math_signed, + brw_inst_math_msg_signed_int(devinfo, inst), &space); + err |= control(file, "math scalar", math_scalar, + brw_inst_math_msg_data_type(devinfo, inst), &space); + err |= control(file, "math precision", math_precision, + brw_inst_math_msg_precision(devinfo, inst), &space); + break; + case BRW_SFID_SAMPLER: + if (devinfo->ver >= 20) { + err |= control(file, "sampler message", xe2_sampler_msg_type, + brw_sampler_desc_msg_type(devinfo, imm_desc), + &space); + err |= control(file, "sampler simd mode", xe2_sampler_simd_mode, + brw_sampler_desc_simd_mode(devinfo, imm_desc), + &space); + if (brw_sampler_desc_return_format(devinfo, imm_desc)) { + string(file, " HP"); + } + format(file, " Surface = %u Sampler = %u", + brw_sampler_desc_binding_table_index(devinfo, imm_desc), + brw_sampler_desc_sampler(devinfo, imm_desc)); + } else if (devinfo->ver >= 5) { + err |= control(file, "sampler message", gfx5_sampler_msg_type, + brw_sampler_desc_msg_type(devinfo, imm_desc), + &space); + err |= control(file, "sampler simd mode", gfx5_sampler_simd_mode, + brw_sampler_desc_simd_mode(devinfo, imm_desc), + &space); + if (devinfo->ver >= 8 && + brw_sampler_desc_return_format(devinfo, imm_desc)) { + string(file, " HP"); + } + format(file, " Surface = %u Sampler = %u", + brw_sampler_desc_binding_table_index(devinfo, imm_desc), + brw_sampler_desc_sampler(devinfo, imm_desc)); + } else { + format(file, " (bti %u, sampler %u, msg_type %u, ", + brw_sampler_desc_binding_table_index(devinfo, imm_desc), + 
brw_sampler_desc_sampler(devinfo, imm_desc), + brw_sampler_desc_msg_type(devinfo, imm_desc)); + if (devinfo->verx10 != 45) { + err |= control(file, "sampler target format", + sampler_target_format, + brw_sampler_desc_return_format(devinfo, imm_desc), + NULL); + } + string(file, ")"); + } + break; + case GFX6_SFID_DATAPORT_SAMPLER_CACHE: + case GFX6_SFID_DATAPORT_CONSTANT_CACHE: + /* aka BRW_SFID_DATAPORT_READ on Gfx4-5 */ + if (devinfo->ver >= 6) { + format(file, " (bti %u, msg_ctrl %u, msg_type %u, write_commit %u)", + brw_dp_desc_binding_table_index(devinfo, imm_desc), + brw_dp_desc_msg_control(devinfo, imm_desc), + brw_dp_desc_msg_type(devinfo, imm_desc), + devinfo->ver >= 7 ? 0u : + brw_dp_write_desc_write_commit(devinfo, imm_desc)); + } else { + bool is_965 = devinfo->verx10 == 40; + err |= control(file, "DP read message type", + is_965 ? gfx4_dp_read_port_msg_type : + g45_dp_read_port_msg_type, + brw_dp_read_desc_msg_type(devinfo, imm_desc), + &space); + + format(file, " MsgCtrl = 0x%u", + brw_dp_read_desc_msg_control(devinfo, imm_desc)); + + format(file, " Surface = %u", + brw_dp_desc_binding_table_index(devinfo, imm_desc)); + } + break; + + case GFX6_SFID_DATAPORT_RENDER_CACHE: { + /* aka BRW_SFID_DATAPORT_WRITE on Gfx4-5 */ + unsigned msg_type = brw_fb_write_desc_msg_type(devinfo, imm_desc); + + err |= control(file, "DP rc message type", + dp_rc_msg_type(devinfo), msg_type, &space); + + bool is_rt_write = msg_type == + (devinfo->ver >= 6 ? 
GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE + : BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE); + + if (is_rt_write) { + err |= control(file, "RT message type", m_rt_write_subtype, + brw_inst_rt_message_type(devinfo, inst), &space); + if (devinfo->ver >= 6 && brw_inst_rt_slot_group(devinfo, inst)) + string(file, " Hi"); + if (brw_fb_write_desc_last_render_target(devinfo, imm_desc)) + string(file, " LastRT"); + if (devinfo->ver >= 10 && + brw_fb_write_desc_coarse_write(devinfo, imm_desc)) + string(file, " CoarseWrite"); + if (devinfo->ver < 7 && + brw_fb_write_desc_write_commit(devinfo, imm_desc)) + string(file, " WriteCommit"); + } else { + format(file, " MsgCtrl = 0x%u", + brw_fb_write_desc_msg_control(devinfo, imm_desc)); + } + + format(file, " Surface = %u", + brw_fb_desc_binding_table_index(devinfo, imm_desc)); + break; + } + + case BRW_SFID_URB: { + if (devinfo->ver >= 20) { + format(file, " ("); + const enum lsc_opcode op = lsc_msg_desc_opcode(devinfo, imm_desc); + err |= control(file, "operation", lsc_operation, + op, &space); + format(file, ","); + err |= control(file, "addr_size", lsc_addr_size, + lsc_msg_desc_addr_size(devinfo, imm_desc), + &space); + + format(file, ","); + err |= control(file, "data_size", lsc_data_size, + lsc_msg_desc_data_size(devinfo, imm_desc), + &space); + format(file, ","); + if (lsc_opcode_has_cmask(op)) { + err |= control(file, "component_mask", + lsc_cmask_str, + lsc_msg_desc_cmask(devinfo, imm_desc), + &space); + } else { + err |= control(file, "vector_size", + lsc_vect_size_str, + lsc_msg_desc_vect_size(devinfo, imm_desc), + &space); + if (lsc_msg_desc_transpose(devinfo, imm_desc)) + format(file, ", transpose"); + } + switch(op) { + case LSC_OP_LOAD_CMASK: + case LSC_OP_LOAD: + format(file, ","); + err |= control(file, "cache_load", + lsc_cache_load, + lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + &space); + break; + default: + format(file, ","); + err |= control(file, "cache_store", + lsc_cache_store, + 
lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + &space); + break; + } + + format(file, " dst_len = %u,", lsc_msg_desc_dest_len(devinfo, imm_desc)); + format(file, " src0_len = %u,", lsc_msg_desc_src0_len(devinfo, imm_desc)); + format(file, " src1_len = %d", brw_message_ex_desc_ex_mlen(devinfo, imm_ex_desc)); + err |= control(file, "address_type", lsc_addr_surface_type, + lsc_msg_desc_addr_type(devinfo, imm_desc), &space); + format(file, " )"); + } else { + unsigned urb_opcode = brw_inst_urb_opcode(devinfo, inst); + + format(file, " offset %"PRIu64, brw_inst_urb_global_offset(devinfo, inst)); + + space = 1; + + err |= control(file, "urb opcode", + devinfo->ver >= 7 ? gfx7_urb_opcode + : gfx5_urb_opcode, + urb_opcode, &space); + + if (devinfo->ver >= 7 && + brw_inst_urb_per_slot_offset(devinfo, inst)) { + string(file, " per-slot"); + } + + if (urb_opcode == GFX8_URB_OPCODE_SIMD8_WRITE || + urb_opcode == GFX8_URB_OPCODE_SIMD8_READ) { + if (brw_inst_urb_channel_mask_present(devinfo, inst)) + string(file, " masked"); + } else if (urb_opcode != GFX125_URB_OPCODE_FENCE) { + err |= control(file, "urb swizzle", urb_swizzle, + brw_inst_urb_swizzle_control(devinfo, inst), + &space); + } + + if (devinfo->ver < 7) { + err |= control(file, "urb allocate", urb_allocate, + brw_inst_urb_allocate(devinfo, inst), &space); + err |= control(file, "urb used", urb_used, + brw_inst_urb_used(devinfo, inst), &space); + } + if (devinfo->ver < 8) { + err |= control(file, "urb complete", urb_complete, + brw_inst_urb_complete(devinfo, inst), &space); + } + } + break; + } + case BRW_SFID_THREAD_SPAWNER: + break; + + case BRW_SFID_MESSAGE_GATEWAY: + format(file, " (%s)", + gfx7_gateway_subfuncid[brw_inst_gateway_subfuncid(devinfo, inst)]); + break; + + case GFX12_SFID_SLM: + case GFX12_SFID_TGM: + case GFX12_SFID_UGM: { + assert(devinfo->has_lsc); + format(file, " ("); + const enum lsc_opcode op = lsc_msg_desc_opcode(devinfo, imm_desc); + err |= control(file, "operation", lsc_operation, + op, 
&space); + format(file, ","); + err |= control(file, "addr_size", lsc_addr_size, + lsc_msg_desc_addr_size(devinfo, imm_desc), + &space); + + if (op == LSC_OP_FENCE) { + format(file, ","); + err |= control(file, "scope", lsc_fence_scope, + lsc_fence_msg_desc_scope(devinfo, imm_desc), + &space); + format(file, ","); + err |= control(file, "flush_type", lsc_flush_type, + lsc_fence_msg_desc_flush_type(devinfo, imm_desc), + &space); + format(file, ","); + err |= control(file, "backup_mode_fence_routing", + lsc_backup_fence_routing, + lsc_fence_msg_desc_backup_routing(devinfo, imm_desc), + &space); + } else { + format(file, ","); + err |= control(file, "data_size", lsc_data_size, + lsc_msg_desc_data_size(devinfo, imm_desc), + &space); + format(file, ","); + if (lsc_opcode_has_cmask(op)) { + err |= control(file, "component_mask", + lsc_cmask_str, + lsc_msg_desc_cmask(devinfo, imm_desc), + &space); + } else { + err |= control(file, "vector_size", + lsc_vect_size_str, + lsc_msg_desc_vect_size(devinfo, imm_desc), + &space); + if (lsc_msg_desc_transpose(devinfo, imm_desc)) + format(file, ", transpose"); + } + switch(op) { + case LSC_OP_LOAD_CMASK: + case LSC_OP_LOAD: + format(file, ","); + err |= control(file, "cache_load", + devinfo->ver >= 20 ? + xe2_lsc_cache_load : + lsc_cache_load, + lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + &space); + break; + default: + format(file, ","); + err |= control(file, "cache_store", + devinfo->ver >= 20 ? 
+ xe2_lsc_cache_store : + lsc_cache_store, + lsc_msg_desc_cache_ctrl(devinfo, imm_desc), + &space); + break; + } + } + format(file, " dst_len = %u,", lsc_msg_desc_dest_len(devinfo, imm_desc)); + format(file, " src0_len = %u,", lsc_msg_desc_src0_len(devinfo, imm_desc)); + + if (!brw_inst_send_sel_reg32_ex_desc(devinfo, inst)) + format(file, " src1_len = %d", + brw_message_ex_desc_ex_mlen(devinfo, imm_ex_desc)); + + err |= control(file, "address_type", lsc_addr_surface_type, + lsc_msg_desc_addr_type(devinfo, imm_desc), &space); + format(file, " )"); + break; + } + + case GFX7_SFID_DATAPORT_DATA_CACHE: + if (devinfo->ver >= 7) { + format(file, " ("); + space = 0; + + err |= control(file, "DP DC0 message type", + dp_dc0_msg_type_gfx7, + brw_dp_desc_msg_type(devinfo, imm_desc), &space); + + format(file, ", bti %u, ", + brw_dp_desc_binding_table_index(devinfo, imm_desc)); + + switch (brw_inst_dp_msg_type(devinfo, inst)) { + case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP: + control(file, "atomic op", aop, + brw_dp_desc_msg_control(devinfo, imm_desc) & 0xf, + &space); + break; + case GFX7_DATAPORT_DC_OWORD_BLOCK_READ: + case GFX7_DATAPORT_DC_OWORD_BLOCK_WRITE: { + unsigned msg_ctrl = brw_dp_desc_msg_control(devinfo, imm_desc); + assert(dp_oword_block_rw[msg_ctrl & 7]); + format(file, "owords = %s, aligned = %d", + dp_oword_block_rw[msg_ctrl & 7], (msg_ctrl >> 3) & 3); + break; + } + default: + format(file, "%u", + brw_dp_desc_msg_control(devinfo, imm_desc)); + } + format(file, ")"); + } else { + unsupported = true; + } + break; + + case HSW_SFID_DATAPORT_DATA_CACHE_1: { + if (devinfo->ver >= 7) { + format(file, " ("); + space = 0; + + unsigned msg_ctrl = brw_dp_desc_msg_control(devinfo, imm_desc); + + err |= control(file, "DP DC1 message type", + dp_dc1_msg_type_hsw, + brw_dp_desc_msg_type(devinfo, imm_desc), &space); + + format(file, ", Surface = %u, ", + brw_dp_desc_binding_table_index(devinfo, imm_desc)); + + switch (brw_inst_dp_msg_type(devinfo, inst)) { + case 
HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: + case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP: + format(file, "SIMD%d,", (msg_ctrl & (1 << 4)) ? 8 : 16); + FALLTHROUGH; + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2: + case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP: + case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP: + control(file, "atomic op", aop, msg_ctrl & 0xf, &space); + break; + case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ: + case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE: + case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ: + case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: + case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE: + case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: { + static const char *simd_modes[] = { "4x2", "16", "8" }; + format(file, "SIMD%s, Mask = 0x%x", + simd_modes[msg_ctrl >> 4], msg_ctrl & 0xf); + break; + } + case GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP: + case GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP: + case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP: + format(file, "SIMD%d,", (msg_ctrl & (1 << 4)) ? 8 : 16); + control(file, "atomic float op", aop_float, msg_ctrl & 0xf, + &space); + break; + case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE: + case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ: + assert(dp_oword_block_rw[msg_ctrl & 7]); + format(file, "owords = %s, aligned = %d", + dp_oword_block_rw[msg_ctrl & 7], (msg_ctrl >> 3) & 3); + break; + default: + format(file, "0x%x", msg_ctrl); + } + format(file, ")"); + } else { + unsupported = true; + } + break; + } + + case GFX7_SFID_PIXEL_INTERPOLATOR: + if (devinfo->ver >= 7) { + format(file, " (%s, %s, 0x%02"PRIx64")", + brw_inst_pi_nopersp(devinfo, inst) ? 
"linear" : "persp", + pixel_interpolator_msg_types[brw_inst_pi_message_type(devinfo, inst)], + brw_inst_pi_message_data(devinfo, inst)); + } else { + unsupported = true; + } + break; + + case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: + if (devinfo->has_ray_tracing) { + format(file, " SIMD%d,", + brw_rt_trace_ray_desc_exec_size(devinfo, imm_desc)); + } else { + unsupported = true; + } + break; + + default: + unsupported = true; + break; + } + + if (unsupported) + format(file, "unsupported shared function ID %d", sfid); + + if (space) + string(file, " "); + } + if (devinfo->verx10 >= 125 && + brw_inst_send_sel_reg32_ex_desc(devinfo, inst) && + brw_inst_send_ex_bso(devinfo, inst)) { + format(file, " src1_len = %u", + (unsigned) brw_inst_send_src1_len(devinfo, inst)); + + format(file, " ex_bso"); + } + if (brw_sfid_is_lsc(sfid) || + (sfid == BRW_SFID_URB && devinfo->ver >= 20)) { + lsc_disassemble_ex_desc(devinfo, imm_desc, imm_ex_desc, file); + } else { + if (has_imm_desc) + format(file, " mlen %u", brw_message_desc_mlen(devinfo, imm_desc)); + if (has_imm_ex_desc) { + format(file, " ex_mlen %u", + brw_message_ex_desc_ex_mlen(devinfo, imm_ex_desc)); + } + if (has_imm_desc) + format(file, " rlen %u", brw_message_desc_rlen(devinfo, imm_desc)); + } + } + pad(file, 64); + if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) { + string(file, "{"); + space = 1; + err |= control(file, "access mode", access_mode, + brw_inst_access_mode(devinfo, inst), &space); + if (devinfo->ver >= 6) { + err |= control(file, "write enable control", wectrl, + brw_inst_mask_control(devinfo, inst), &space); + } else { + err |= control(file, "mask control", mask_ctrl, + brw_inst_mask_control(devinfo, inst), &space); + } + + if (devinfo->ver < 12) { + err |= control(file, "dependency control", dep_ctrl, + ((brw_inst_no_dd_check(devinfo, inst) << 1) | + brw_inst_no_dd_clear(devinfo, inst)), &space); + } + + if (devinfo->ver >= 6) + err |= qtr_ctrl(file, devinfo, inst); + else { + if 
(brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_COMPRESSED && + desc && desc->ndst > 0 && + brw_inst_dst_reg_file(devinfo, inst) == BRW_MESSAGE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, inst) & BRW_MRF_COMPR4) { + format(file, " compr4"); + } else { + err |= control(file, "compression control", compr_ctrl, + brw_inst_qtr_control(devinfo, inst), &space); + } + } + + if (devinfo->ver >= 12) + err |= swsb(file, isa, inst); + + err |= control(file, "compaction", cmpt_ctrl, is_compacted, &space); + err |= control(file, "thread control", thread_ctrl, + (devinfo->ver >= 12 ? brw_inst_atomic_control(devinfo, inst) : + brw_inst_thread_control(devinfo, inst)), + &space); + if (has_branch_ctrl(devinfo, opcode)) { + err |= control(file, "branch ctrl", branch_ctrl, + brw_inst_branch_control(devinfo, inst), &space); + } else if (devinfo->ver >= 6 && devinfo->ver < 20) { + err |= control(file, "acc write control", accwr, + brw_inst_acc_wr_control(devinfo, inst), &space); + } + if (is_send(opcode)) + err |= control(file, "end of thread", end_of_thread, + brw_inst_eot(devinfo, inst), &space); + if (space) + string(file, " "); + string(file, "}"); + } + string(file, ";"); + newline(file); + return err; +} + +int +brw_disassemble_find_end(const struct brw_isa_info *isa, + const void *assembly, int start) +{ + const struct intel_device_info *devinfo = isa->devinfo; + int offset = start; + + /* This loop exits when send-with-EOT or when opcode is 0 */ + while (true) { + const brw_inst *insn = assembly + offset; + + if (brw_inst_cmpt_control(devinfo, insn)) { + offset += 8; + } else { + offset += 16; + } + + /* Simplistic, but efficient way to terminate disasm */ + uint32_t opcode = brw_inst_opcode(isa, insn); + if (opcode == 0 || (is_send(opcode) && brw_inst_eot(devinfo, insn))) { + break; + } + } + + return offset; +} + +void +brw_disassemble_with_errors(const struct brw_isa_info *isa, + const void *assembly, int start, FILE *out) +{ + int end = 
brw_disassemble_find_end(isa, assembly, start); + + /* Make a dummy disasm structure that brw_validate_instructions + * can work from. + */ + struct disasm_info *disasm_info = disasm_initialize(isa, NULL); + disasm_new_inst_group(disasm_info, start); + disasm_new_inst_group(disasm_info, end); + + brw_validate_instructions(isa, assembly, start, end, disasm_info); + + void *mem_ctx = ralloc_context(NULL); + const struct brw_label *root_label = + brw_label_assembly(isa, assembly, start, end, mem_ctx); + + foreach_list_typed(struct inst_group, group, link, + &disasm_info->group_list) { + struct exec_node *next_node = exec_node_get_next(&group->link); + if (exec_node_is_tail_sentinel(next_node)) + break; + + struct inst_group *next = + exec_node_data(struct inst_group, next_node, link); + + int start_offset = group->offset; + int end_offset = next->offset; + + brw_disassemble(isa, assembly, start_offset, end_offset, + root_label, out); + + if (group->error) { + fputs(group->error, out); + } + } + + ralloc_free(mem_ctx); + ralloc_free(disasm_info); +} diff --git a/src/intel/compiler/elk/brw_disasm.h b/src/intel/compiler/elk/brw_disasm.h new file mode 100644 index 00000000000..3ebfcfd3051 --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm.h @@ -0,0 +1,42 @@ +/* + * Copyright 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#ifndef BRW_DISASM_H +#define BRW_DISASM_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct brw_isa_info; +struct brw_inst; + +const struct brw_label *brw_find_label(const struct brw_label *root, int offset); +void brw_create_label(struct brw_label **labels, int offset, void *mem_ctx); +int brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa, + const struct brw_inst *inst, bool is_compacted, + int offset, const struct brw_label *root_label); +const struct +brw_label *brw_label_assembly(const struct brw_isa_info *isa, + const void *assembly, int start, int end, + void *mem_ctx); +void 
brw_disassemble_with_labels(const struct brw_isa_info *isa, + const void *assembly, int start, int end, FILE *out); +void brw_disassemble(const struct brw_isa_info *isa, + const void *assembly, int start, int end, + const struct brw_label *root_label, FILE *out); +int brw_disassemble_find_end(const struct brw_isa_info *isa, + const void *assembly, int start); +void brw_disassemble_with_errors(const struct brw_isa_info *isa, + const void *assembly, int start, FILE *out); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* BRW_DISASM_H */ diff --git a/src/intel/compiler/elk/brw_disasm_info.c b/src/intel/compiler/elk/brw_disasm_info.c new file mode 100644 index 00000000000..cb9a2e42233 --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm_info.c @@ -0,0 +1,207 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_cfg.h" +#include "brw_eu.h" +#include "brw_disasm.h" +#include "brw_disasm_info.h" +#include "dev/intel_debug.h" +#include "compiler/nir/nir.h" + +__attribute__((weak)) void nir_print_instr(UNUSED const nir_instr *instr, + UNUSED FILE *fp) {} + +void +dump_assembly(void *assembly, int start_offset, int end_offset, + struct disasm_info *disasm, const unsigned *block_latency) +{ + const struct brw_isa_info *isa = disasm->isa; + const char *last_annotation_string = NULL; + const void *last_annotation_ir = NULL; + + void *mem_ctx = ralloc_context(NULL); + const struct brw_label *root_label = + brw_label_assembly(isa, assembly, start_offset, end_offset, mem_ctx); + + foreach_list_typed(struct inst_group, group, link, &disasm->group_list) { + struct exec_node *next_node = exec_node_get_next(&group->link); + if (exec_node_is_tail_sentinel(next_node)) + break; + + struct inst_group *next = + exec_node_data(struct inst_group, next_node, link); + + int start_offset = group->offset; + int end_offset = next->offset; + + if (group->block_start) { + fprintf(stderr, " START B%d", group->block_start->num); + foreach_list_typed(struct bblock_link, predecessor_link, link, + &group->block_start->parents) { + struct bblock_t *predecessor_block = predecessor_link->block; + fprintf(stderr, " <-B%d", predecessor_block->num); + } + if (block_latency) + fprintf(stderr, " (%u cycles)", + block_latency[group->block_start->num]); + fprintf(stderr, "\n"); + } + + if (last_annotation_ir != group->ir) { + last_annotation_ir = group->ir; + if (last_annotation_ir) { + fprintf(stderr, " "); + nir_print_instr(group->ir, stderr); + fprintf(stderr, "\n"); + } + } + + if (last_annotation_string != group->annotation) { + last_annotation_string = group->annotation; + if (last_annotation_string) + fprintf(stderr, " %s\n", last_annotation_string); + } + + brw_disassemble(isa, assembly, start_offset, end_offset, + root_label, stderr); + + if (group->error) { + fputs(group->error, 
stderr); + } + + if (group->block_end) { + fprintf(stderr, " END B%d", group->block_end->num); + foreach_list_typed(struct bblock_link, successor_link, link, + &group->block_end->children) { + struct bblock_t *successor_block = successor_link->block; + fprintf(stderr, " ->B%d", successor_block->num); + } + fprintf(stderr, "\n"); + } + } + fprintf(stderr, "\n"); + + ralloc_free(mem_ctx); +} + +struct disasm_info * +disasm_initialize(const struct brw_isa_info *isa, + const struct cfg_t *cfg) +{ + struct disasm_info *disasm = ralloc(NULL, struct disasm_info); + exec_list_make_empty(&disasm->group_list); + disasm->isa = isa; + disasm->cfg = cfg; + disasm->cur_block = 0; + disasm->use_tail = false; + return disasm; +} + +struct inst_group * +disasm_new_inst_group(struct disasm_info *disasm, unsigned next_inst_offset) +{ + struct inst_group *tail = rzalloc(disasm, struct inst_group); + tail->offset = next_inst_offset; + exec_list_push_tail(&disasm->group_list, &tail->link); + return tail; +} + +void +disasm_annotate(struct disasm_info *disasm, + struct backend_instruction *inst, unsigned offset) +{ + const struct intel_device_info *devinfo = disasm->isa->devinfo; + const struct cfg_t *cfg = disasm->cfg; + + struct inst_group *group; + if (!disasm->use_tail) { + group = disasm_new_inst_group(disasm, offset); + } else { + disasm->use_tail = false; + group = exec_node_data(struct inst_group, + exec_list_get_tail_raw(&disasm->group_list), link); + } + + if (INTEL_DEBUG(DEBUG_ANNOTATION)) { + group->ir = inst->ir; + group->annotation = inst->annotation; + } + + if (bblock_start(cfg->blocks[disasm->cur_block]) == inst) { + group->block_start = cfg->blocks[disasm->cur_block]; + } + + /* There is no hardware DO instruction on Gfx6+, so since DO always + * starts a basic block, we need to set the .block_start of the next + * instruction's annotation with a pointer to the bblock started by + * the DO. 
+ * + * There's also only complication from emitting an annotation without + * a corresponding hardware instruction to disassemble. + */ + if (devinfo->ver >= 6 && inst->opcode == BRW_OPCODE_DO) { + disasm->use_tail = true; + } + + if (bblock_end(cfg->blocks[disasm->cur_block]) == inst) { + group->block_end = cfg->blocks[disasm->cur_block]; + disasm->cur_block++; + } +} + +void +disasm_insert_error(struct disasm_info *disasm, unsigned offset, + unsigned inst_size, const char *error) +{ + foreach_list_typed(struct inst_group, cur, link, &disasm->group_list) { + struct exec_node *next_node = exec_node_get_next(&cur->link); + if (exec_node_is_tail_sentinel(next_node)) + break; + + struct inst_group *next = + exec_node_data(struct inst_group, next_node, link); + + if (next->offset <= offset) + continue; + + if (offset + inst_size != next->offset) { + struct inst_group *new = ralloc(disasm, struct inst_group); + memcpy(new, cur, sizeof(struct inst_group)); + + cur->error = NULL; + cur->error_length = 0; + cur->block_end = NULL; + + new->offset = offset + inst_size; + new->block_start = NULL; + + exec_node_insert_after(&cur->link, &new->link); + } + + if (cur->error) + ralloc_strcat(&cur->error, error); + else + cur->error = ralloc_strdup(disasm, error); + return; + } +} diff --git a/src/intel/compiler/elk/brw_disasm_info.h b/src/intel/compiler/elk/brw_disasm_info.h new file mode 100644 index 00000000000..937180b7e2e --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm_info.h @@ -0,0 +1,90 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the 
following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _INTEL_ASM_ANNOTATION_H +#define _INTEL_ASM_ANNOTATION_H + +#include "compiler/glsl/list.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct cfg_t; +struct backend_instruction; +struct intel_device_info; + +struct inst_group { + struct exec_node link; + + int offset; + + size_t error_length; + char *error; + + /* Pointers to the basic block in the CFG if the instruction group starts + * or ends a basic block. + */ + struct bblock_t *block_start; + struct bblock_t *block_end; + + /* Annotation for the generated IR. One of the two can be set. */ + const void *ir; + const char *annotation; +}; + +struct disasm_info { + struct exec_list group_list; + + const struct brw_isa_info *isa; + const struct cfg_t *cfg; + + /** Block index in the cfg. 
*/ + int cur_block; + bool use_tail; +}; + +void +dump_assembly(void *assembly, int start_offset, int end_offset, + struct disasm_info *disasm, const unsigned *block_latency); + +struct disasm_info * +disasm_initialize(const struct brw_isa_info *isa, + const struct cfg_t *cfg); + +struct inst_group * +disasm_new_inst_group(struct disasm_info *disasm, unsigned offset); + +void +disasm_annotate(struct disasm_info *disasm, + struct backend_instruction *inst, unsigned offset); + +void +disasm_insert_error(struct disasm_info *disasm, unsigned offset, + unsigned inst_size, const char *error); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* _INTEL_ASM_ANNOTATION_H */ diff --git a/src/intel/compiler/elk/brw_disasm_tool.c b/src/intel/compiler/elk/brw_disasm_tool.c new file mode 100644 index 00000000000..1771b2e369c --- /dev/null +++ b/src/intel/compiler/elk/brw_disasm_tool.c @@ -0,0 +1,242 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +#include "compiler/brw_disasm.h" +#include "compiler/brw_isa_info.h" +#include "dev/intel_device_info.h" +#include "util/u_dynarray.h" + +enum opt_input_type { + OPT_INPUT_BINARY, + OPT_INPUT_C_LITERAL, +}; + +static enum opt_input_type input_type = OPT_INPUT_BINARY; + +/* Return size of file in bytes pointed by fp */ +static long +i965_disasm_get_file_size(FILE *fp) +{ + long size; + + fseek(fp, 0L, SEEK_END); + size = ftell(fp); + fseek(fp, 0L, SEEK_SET); + + return size; +} + +/* Read hex file which should be in following format: + * for example : + * { 0x00000000, 0x00000000, 0x00000000, 0x00000000 } + */ +static void * +i965_disasm_read_c_literal_file(FILE *fp, size_t *end) +{ + struct util_dynarray assembly = {}; + uint32_t temp[2]; + + if (fscanf(fp, " { ") == EOF) { + fprintf(stderr, "Couldn't find opening `{`\n"); + return NULL; + } + + if (fscanf(fp, "0x%x , 0x%x", &temp[0], &temp[1]) == 2) { + util_dynarray_append(&assembly, uint32_t, temp[0]); + util_dynarray_append(&assembly, uint32_t, temp[1]); + } else { + fprintf(stderr, "Couldn't read hex values\n"); + return NULL; + } + + while (fscanf(fp, " , 0x%x , 0x%x ", &temp[0], &temp[1]) == 2) { + util_dynarray_append(&assembly, uint32_t, temp[0]); + util_dynarray_append(&assembly, uint32_t, temp[1]); + } + + if (fscanf(fp, "}") == EOF) { + fprintf(stderr, "Couldn't find closing `}`\n"); + return NULL; + } + + *end = assembly.size; + return assembly.data; +} + +static void * +i965_disasm_read_binary(FILE *fp, size_t *end) +{ + size_t size; + void *assembly; + + long sz = i965_disasm_get_file_size(fp); + if (sz < 0) + return NULL; + + *end = (size_t)sz; + if (!*end) + return NULL; + + 
assembly = malloc(*end + 1); + if (assembly == NULL) + return NULL; + + size = fread(assembly, *end, 1, fp); + if (!size) { + free(assembly); + return NULL; + } + return assembly; +} + +static void +print_help(const char *progname, FILE *file) +{ + fprintf(file, + "Usage: %s [OPTION]...\n" + "Disassemble i965 instructions from binary file.\n\n" + " --help display this help and exit\n" + " --input-path=PATH read binary file from binary file PATH\n" + " --type=INPUT_TYPE INPUT_TYPE can be 'bin' (default if omitted),\n" + " 'c_literal'.\n" + " --gen=platform disassemble instructions for given \n" + " platform (3 letter platform name)\n", + progname); +} + +int main(int argc, char *argv[]) +{ + FILE *fp = NULL; + void *assembly = NULL; + char *file_path = NULL; + size_t start = 0, end = 0; + uint16_t pci_id = 0; + int c; + int result = EXIT_FAILURE; + + bool help = false; + const struct option i965_disasm_opts[] = { + { "help", no_argument, (int *) &help, true }, + { "input-path", required_argument, NULL, 'i' }, + { "type", required_argument, NULL, 't' }, + { "gen", required_argument, NULL, 'g'}, + { NULL, 0, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, ":i:t:g:h", i965_disasm_opts, NULL)) != -1) { + switch (c) { + case 'g': { + const int id = intel_device_name_to_pci_device_id(optarg); + if (id < 0) { + fprintf(stderr, "can't parse gen: '%s', expected 3 letter " + "platform name\n", optarg); + goto end; + } else { + pci_id = id; + } + break; + } + case 'i': + file_path = strdup(optarg); + fp = fopen(file_path, "r"); + if (!fp) { + fprintf(stderr, "Unable to read input file : %s\n", + file_path); + goto end; + } + break; + case 't': + if (strcmp(optarg, "c_literal") == 0) { + input_type = OPT_INPUT_C_LITERAL; + } else if (strcmp(optarg, "bin") == 0) { + input_type = OPT_INPUT_BINARY; + } else { + fprintf(stderr, "invalid value for --type: %s\n", optarg); + goto end; + } + break; + case 'h': + help = true; + print_help(argv[0], stderr); + goto end; + case 0: + 
break; + case ':': + fprintf(stderr, "%s: option `-%c' requires an argument\n", + argv[0], optopt); + goto end; + case '?': + default: + fprintf(stderr, "%s: option `-%c' is invalid: ignored\n", + argv[0], optopt); + goto end; + } + } + + if (help || !file_path || !pci_id) { + print_help(argv[0], stderr); + exit(0); + } + + struct intel_device_info devinfo; + if (!intel_get_device_info_from_pci_id(pci_id, &devinfo)) { + fprintf(stderr, "can't find device information: pci_id=0x%x\n", pci_id); + exit(EXIT_FAILURE); + } + + struct brw_isa_info isa; + brw_init_isa_info(&isa, &devinfo); + + if (input_type == OPT_INPUT_BINARY) + assembly = i965_disasm_read_binary(fp, &end); + else if (input_type == OPT_INPUT_C_LITERAL) + assembly = i965_disasm_read_c_literal_file(fp, &end); + + if (!assembly) { + if (end) + fprintf(stderr, "Unable to allocate buffer to read input file\n"); + else + fprintf(stderr, "Failed to read input file\n"); + + goto end; + } + + /* Disassemble i965 instructions from buffer assembly */ + brw_disassemble_with_labels(&isa, assembly, start, end, stdout); + + result = EXIT_SUCCESS; + +end: + if (fp) + fclose(fp); + + free(file_path); + free(assembly); + + exit(result); +} diff --git a/src/intel/compiler/elk/brw_eu.c b/src/intel/compiler/elk/brw_eu.c new file mode 100644 index 00000000000..d6b94f3441d --- /dev/null +++ b/src/intel/compiler/elk/brw_eu.c @@ -0,0 +1,856 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include +#include + +#include "brw_disasm.h" +#include "brw_eu_defines.h" +#include "brw_eu.h" +#include "brw_shader.h" +#include "intel_gfx_ver_enum.h" +#include "dev/intel_debug.h" + +#include "util/u_debug.h" +#include "util/ralloc.h" + +/* Returns a conditional modifier that negates the condition. 
*/ +enum brw_conditional_mod +brw_negate_cmod(enum brw_conditional_mod cmod) +{ + switch (cmod) { + case BRW_CONDITIONAL_Z: + return BRW_CONDITIONAL_NZ; + case BRW_CONDITIONAL_NZ: + return BRW_CONDITIONAL_Z; + case BRW_CONDITIONAL_G: + return BRW_CONDITIONAL_LE; + case BRW_CONDITIONAL_GE: + return BRW_CONDITIONAL_L; + case BRW_CONDITIONAL_L: + return BRW_CONDITIONAL_GE; + case BRW_CONDITIONAL_LE: + return BRW_CONDITIONAL_G; + default: + unreachable("Can't negate this cmod"); + } +} + +/* Returns the corresponding conditional mod for swapping src0 and + * src1 in e.g. CMP. + */ +enum brw_conditional_mod +brw_swap_cmod(enum brw_conditional_mod cmod) +{ + switch (cmod) { + case BRW_CONDITIONAL_Z: + case BRW_CONDITIONAL_NZ: + return cmod; + case BRW_CONDITIONAL_G: + return BRW_CONDITIONAL_L; + case BRW_CONDITIONAL_GE: + return BRW_CONDITIONAL_LE; + case BRW_CONDITIONAL_L: + return BRW_CONDITIONAL_G; + case BRW_CONDITIONAL_LE: + return BRW_CONDITIONAL_GE; + default: + return BRW_CONDITIONAL_NONE; + } +} + +/** + * Get the least significant bit offset of the i+1-th component of immediate + * type \p type. For \p i equal to the two's complement of j, return the + * offset of the j-th component starting from the end of the vector. For + * scalar register types return zero. + */ +static unsigned +imm_shift(enum brw_reg_type type, unsigned i) +{ + assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V && + "Not implemented."); + + if (type == BRW_REGISTER_TYPE_VF) + return 8 * (i & 3); + else + return 0; +} + +/** + * Swizzle an arbitrary immediate \p x of the given type according to the + * permutation specified as \p swz. 
+ */ +uint32_t +brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz) +{ + if (imm_shift(type, 1)) { + const unsigned n = 32 / imm_shift(type, 1); + uint32_t y = 0; + + for (unsigned i = 0; i < n; i++) { + /* Shift the specified component all the way to the right and left to + * discard any undesired L/MSBs, then shift it right into component i. + */ + y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3)) + << imm_shift(type, ~0u) + >> imm_shift(type, ~0u - i); + } + + return y; + } else { + return x; + } +} + +unsigned +brw_get_default_exec_size(struct brw_codegen *p) +{ + return p->current->exec_size; +} + +unsigned +brw_get_default_group(struct brw_codegen *p) +{ + return p->current->group; +} + +unsigned +brw_get_default_access_mode(struct brw_codegen *p) +{ + return p->current->access_mode; +} + +struct tgl_swsb +brw_get_default_swsb(struct brw_codegen *p) +{ + return p->current->swsb; +} + +void +brw_set_default_exec_size(struct brw_codegen *p, unsigned value) +{ + p->current->exec_size = value; +} + +void brw_set_default_predicate_control(struct brw_codegen *p, enum brw_predicate pc) +{ + p->current->predicate = pc; +} + +void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse) +{ + p->current->pred_inv = predicate_inverse; +} + +void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg) +{ + assert(subreg < 2); + p->current->flag_subreg = reg * 2 + subreg; +} + +void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode ) +{ + p->current->access_mode = access_mode; +} + +void +brw_set_default_compression_control(struct brw_codegen *p, + enum brw_compression compression_control) +{ + switch (compression_control) { + case BRW_COMPRESSION_NONE: + /* This is the "use the first set of bits of dmask/vmask/arf + * according to execsize" option. + */ + p->current->group = 0; + break; + case BRW_COMPRESSION_2NDHALF: + /* For SIMD8, this is "use the second set of 8 bits." 
*/ + p->current->group = 8; + break; + case BRW_COMPRESSION_COMPRESSED: + /* For SIMD16 instruction compression, use the first set of 16 bits + * since we don't do SIMD32 dispatch. + */ + p->current->group = 0; + break; + default: + unreachable("not reached"); + } + + if (p->devinfo->ver <= 6) { + p->current->compressed = + (compression_control == BRW_COMPRESSION_COMPRESSED); + } +} + +/** + * Enable or disable instruction compression on the given instruction leaving + * the currently selected channel enable group untouched. + */ +void +brw_inst_set_compression(const struct intel_device_info *devinfo, + brw_inst *inst, bool on) +{ + if (devinfo->ver >= 6) { + /* No-op, the EU will figure out for us whether the instruction needs to + * be compressed. + */ + } else { + /* The channel group and compression controls are non-orthogonal, there + * are two possible representations for uncompressed instructions and we + * may need to preserve the current one to avoid changing the selected + * channel group inadvertently. + */ + if (on) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED); + else if (brw_inst_qtr_control(devinfo, inst) + == BRW_COMPRESSION_COMPRESSED) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + } +} + +void +brw_set_default_compression(struct brw_codegen *p, bool on) +{ + p->current->compressed = on; +} + +/** + * Apply the range of channel enable signals given by + * [group, group + exec_size) to the instruction passed as argument. 
+ */ +void +brw_inst_set_group(const struct intel_device_info *devinfo, + brw_inst *inst, unsigned group) +{ + if (devinfo->ver >= 20) { + assert(group % 8 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + + } else if (devinfo->ver >= 7) { + assert(group % 4 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + brw_inst_set_nib_control(devinfo, inst, (group / 4) % 2); + + } else if (devinfo->ver == 6) { + assert(group % 8 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + + } else { + assert(group % 8 == 0 && group < 16); + /* The channel group and compression controls are non-orthogonal, there + * are two possible representations for group zero and we may need to + * preserve the current one to avoid changing the selected compression + * enable inadvertently. + */ + if (group == 8) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF); + else if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_2NDHALF) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + } +} + +void +brw_set_default_group(struct brw_codegen *p, unsigned group) +{ + p->current->group = group; +} + +void brw_set_default_mask_control( struct brw_codegen *p, unsigned value ) +{ + p->current->mask_control = value; +} + +void brw_set_default_saturate( struct brw_codegen *p, bool enable ) +{ + p->current->saturate = enable; +} + +void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value) +{ + p->current->acc_wr_control = value; +} + +void brw_set_default_swsb(struct brw_codegen *p, struct tgl_swsb value) +{ + p->current->swsb = value; +} + +void brw_push_insn_state( struct brw_codegen *p ) +{ + assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]); + *(p->current + 1) = *p->current; + p->current++; +} + +void brw_pop_insn_state( struct brw_codegen *p ) +{ + assert(p->current != p->stack); + p->current--; +} + + 
+/*********************************************************************** + */ +void +brw_init_codegen(const struct brw_isa_info *isa, + struct brw_codegen *p, void *mem_ctx) +{ + memset(p, 0, sizeof(*p)); + + p->isa = isa; + p->devinfo = isa->devinfo; + p->automatic_exec_sizes = true; + /* + * Set the initial instruction store array size to 1024, if found that + * isn't enough, then it will double the store size at brw_next_insn() + * until out of memory. + */ + p->store_size = 1024; + p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size); + p->nr_insn = 0; + p->current = p->stack; + memset(p->current, 0, sizeof(p->current[0])); + + p->mem_ctx = mem_ctx; + + /* Some defaults? + */ + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */ + brw_set_default_saturate(p, 0); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + + /* Set up control flow stack */ + p->if_stack_depth = 0; + p->if_stack_array_size = 16; + p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size); + + p->loop_stack_depth = 0; + p->loop_stack_array_size = 16; + p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); + p->if_depth_in_loop = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); +} + + +const unsigned *brw_get_program( struct brw_codegen *p, + unsigned *sz ) +{ + *sz = p->next_insn_offset; + return (const unsigned *)p->store; +} + +const struct brw_shader_reloc * +brw_get_shader_relocs(struct brw_codegen *p, unsigned *num_relocs) +{ + *num_relocs = p->num_relocs; + return p->relocs; +} + +DEBUG_GET_ONCE_OPTION(shader_bin_dump_path, "INTEL_SHADER_BIN_DUMP_PATH", NULL); + +bool brw_should_dump_shader_bin(void) +{ + return debug_get_option_shader_bin_dump_path() != NULL; +} + +void brw_dump_shader_bin(void *assembly, int start_offset, int end_offset, + const char *identifier) +{ + char *name = ralloc_asprintf(NULL, "%s/%s.bin", + debug_get_option_shader_bin_dump_path(), + 
identifier); + + int fd = open(name, O_CREAT | O_WRONLY, 0777); + ralloc_free(name); + + if (fd < 0) + return; + + struct stat sb; + if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) { + close(fd); + return; + } + + size_t to_write = end_offset - start_offset; + void *write_ptr = assembly + start_offset; + + while (to_write) { + ssize_t ret = write(fd, write_ptr, to_write); + + if (ret <= 0) { + close(fd); + return; + } + + to_write -= ret; + write_ptr += ret; + } + + close(fd); +} + +bool brw_try_override_assembly(struct brw_codegen *p, int start_offset, + const char *identifier) +{ + const char *read_path = getenv("INTEL_SHADER_ASM_READ_PATH"); + if (!read_path) { + return false; + } + + char *name = ralloc_asprintf(NULL, "%s/%s.bin", read_path, identifier); + + int fd = open(name, O_RDONLY); + ralloc_free(name); + + if (fd == -1) { + return false; + } + + struct stat sb; + if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) { + close(fd); + return false; + } + + p->nr_insn -= (p->next_insn_offset - start_offset) / sizeof(brw_inst); + p->nr_insn += sb.st_size / sizeof(brw_inst); + + p->next_insn_offset = start_offset + sb.st_size; + p->store_size = (start_offset + sb.st_size) / sizeof(brw_inst); + p->store = (brw_inst *)reralloc_size(p->mem_ctx, p->store, p->next_insn_offset); + assert(p->store); + + ssize_t ret = read(fd, (char *)p->store + start_offset, sb.st_size); + close(fd); + if (ret != sb.st_size) { + return false; + } + + ASSERTED bool valid = + brw_validate_instructions(p->isa, p->store, + start_offset, p->next_insn_offset, + NULL); + assert(valid); + + return true; +} + +const struct brw_label * +brw_find_label(const struct brw_label *root, int offset) +{ + const struct brw_label *curr = root; + + if (curr != NULL) + { + do { + if (curr->offset == offset) + return curr; + + curr = curr->next; + } while (curr != NULL); + } + + return curr; +} + +void +brw_create_label(struct brw_label **labels, int offset, void *mem_ctx) +{ + if (*labels != NULL) { + 
struct brw_label *curr = *labels; + struct brw_label *prev; + + do { + prev = curr; + + if (curr->offset == offset) + return; + + curr = curr->next; + } while (curr != NULL); + + curr = ralloc(mem_ctx, struct brw_label); + curr->offset = offset; + curr->number = prev->number + 1; + curr->next = NULL; + prev->next = curr; + } else { + struct brw_label *root = ralloc(mem_ctx, struct brw_label); + root->number = 0; + root->offset = offset; + root->next = NULL; + *labels = root; + } +} + +const struct brw_label * +brw_label_assembly(const struct brw_isa_info *isa, + const void *assembly, int start, int end, void *mem_ctx) +{ + const struct intel_device_info *const devinfo = isa->devinfo; + + struct brw_label *root_label = NULL; + + int to_bytes_scale = sizeof(brw_inst) / brw_jump_scale(devinfo); + + for (int offset = start; offset < end;) { + const brw_inst *inst = (const brw_inst *) ((const char *) assembly + offset); + brw_inst uncompacted; + + bool is_compact = brw_inst_cmpt_control(devinfo, inst); + + if (is_compact) { + brw_compact_inst *compacted = (brw_compact_inst *)inst; + brw_uncompact_instruction(isa, &uncompacted, compacted); + inst = &uncompacted; + } + + if (brw_has_uip(devinfo, brw_inst_opcode(isa, inst))) { + /* Instructions that have UIP also have JIP. 
*/ + brw_create_label(&root_label, + offset + brw_inst_uip(devinfo, inst) * to_bytes_scale, mem_ctx); + brw_create_label(&root_label, + offset + brw_inst_jip(devinfo, inst) * to_bytes_scale, mem_ctx); + } else if (brw_has_jip(devinfo, brw_inst_opcode(isa, inst))) { + int jip; + if (devinfo->ver >= 7) { + jip = brw_inst_jip(devinfo, inst); + } else { + jip = brw_inst_gfx6_jump_count(devinfo, inst); + } + + brw_create_label(&root_label, offset + jip * to_bytes_scale, mem_ctx); + } + + if (is_compact) { + offset += sizeof(brw_compact_inst); + } else { + offset += sizeof(brw_inst); + } + } + + return root_label; +} + +void +brw_disassemble_with_labels(const struct brw_isa_info *isa, + const void *assembly, int start, int end, FILE *out) +{ + void *mem_ctx = ralloc_context(NULL); + const struct brw_label *root_label = + brw_label_assembly(isa, assembly, start, end, mem_ctx); + + brw_disassemble(isa, assembly, start, end, root_label, out); + + ralloc_free(mem_ctx); +} + +void +brw_disassemble(const struct brw_isa_info *isa, + const void *assembly, int start, int end, + const struct brw_label *root_label, FILE *out) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + bool dump_hex = INTEL_DEBUG(DEBUG_HEX); + + for (int offset = start; offset < end;) { + const brw_inst *insn = (const brw_inst *)((char *)assembly + offset); + brw_inst uncompacted; + + if (root_label != NULL) { + const struct brw_label *label = brw_find_label(root_label, offset); + if (label != NULL) { + fprintf(out, "\nLABEL%d:\n", label->number); + } + } + + bool compacted = brw_inst_cmpt_control(devinfo, insn); + if (0) + fprintf(out, "0x%08x: ", offset); + + if (compacted) { + brw_compact_inst *compacted = (brw_compact_inst *)insn; + if (dump_hex) { + unsigned char * insn_ptr = ((unsigned char *)&insn[0]); + const unsigned int blank_spaces = 24; + for (int i = 0 ; i < 8; i = i + 4) { + fprintf(out, "%02x %02x %02x %02x ", + insn_ptr[i], + insn_ptr[i + 1], + insn_ptr[i + 2], + insn_ptr[i + 
3]); + } + /* Make compacted instructions hex value output vertically aligned + * with uncompacted instructions hex value + */ + fprintf(out, "%*c", blank_spaces, ' '); + } + + brw_uncompact_instruction(isa, &uncompacted, compacted); + insn = &uncompacted; + } else { + if (dump_hex) { + unsigned char * insn_ptr = ((unsigned char *)&insn[0]); + for (int i = 0 ; i < 16; i = i + 4) { + fprintf(out, "%02x %02x %02x %02x ", + insn_ptr[i], + insn_ptr[i + 1], + insn_ptr[i + 2], + insn_ptr[i + 3]); + } + } + } + + brw_disassemble_inst(out, isa, insn, compacted, offset, root_label); + + if (compacted) { + offset += sizeof(brw_compact_inst); + } else { + offset += sizeof(brw_inst); + } + } +} + +static const struct opcode_desc opcode_descs[] = { + /* IR, HW, name, nsrc, ndst, gfx_vers */ + { BRW_OPCODE_ILLEGAL, 0, "illegal", 0, 0, GFX_ALL }, + { BRW_OPCODE_SYNC, 1, "sync", 1, 0, GFX_GE(GFX12) }, + { BRW_OPCODE_MOV, 1, "mov", 1, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_MOV, 97, "mov", 1, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SEL, 2, "sel", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SEL, 98, "sel", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_MOVI, 3, "movi", 2, 1, GFX_GE(GFX45) & GFX_LT(GFX12) }, + { BRW_OPCODE_MOVI, 99, "movi", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_NOT, 4, "not", 1, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_NOT, 100, "not", 1, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_AND, 5, "and", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_AND, 101, "and", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_OR, 6, "or", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_OR, 102, "or", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_XOR, 7, "xor", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_XOR, 103, "xor", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SHR, 8, "shr", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SHR, 104, "shr", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SHL, 9, "shl", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SHL, 105, "shl", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_DIM, 10, "dim", 1, 1, GFX75 }, + { BRW_OPCODE_SMOV, 10, "smov", 0, 0, GFX_GE(GFX8) & GFX_LT(GFX12) 
}, + { BRW_OPCODE_SMOV, 106, "smov", 0, 0, GFX_GE(GFX12) }, + { BRW_OPCODE_ASR, 12, "asr", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_ASR, 108, "asr", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_ROR, 14, "ror", 2, 1, GFX11 }, + { BRW_OPCODE_ROR, 110, "ror", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_ROL, 15, "rol", 2, 1, GFX11 }, + { BRW_OPCODE_ROL, 111, "rol", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_CMP, 16, "cmp", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_CMP, 112, "cmp", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_CMPN, 17, "cmpn", 2, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_CMPN, 113, "cmpn", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_CSEL, 18, "csel", 3, 1, GFX_GE(GFX8) & GFX_LT(GFX12) }, + { BRW_OPCODE_CSEL, 114, "csel", 3, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_F32TO16, 19, "f32to16", 1, 1, GFX7 | GFX75 }, + { BRW_OPCODE_F16TO32, 20, "f16to32", 1, 1, GFX7 | GFX75 }, + { BRW_OPCODE_BFREV, 23, "bfrev", 1, 1, GFX_GE(GFX7) & GFX_LT(GFX12) }, + { BRW_OPCODE_BFREV, 119, "bfrev", 1, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_BFE, 24, "bfe", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) }, + { BRW_OPCODE_BFE, 120, "bfe", 3, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_BFI1, 25, "bfi1", 2, 1, GFX_GE(GFX7) & GFX_LT(GFX12) }, + { BRW_OPCODE_BFI1, 121, "bfi1", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_BFI2, 26, "bfi2", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) }, + { BRW_OPCODE_BFI2, 122, "bfi2", 3, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_JMPI, 32, "jmpi", 0, 0, GFX_ALL }, + { BRW_OPCODE_BRD, 33, "brd", 0, 0, GFX_GE(GFX7) }, + { BRW_OPCODE_IF, 34, "if", 0, 0, GFX_ALL }, + { BRW_OPCODE_IFF, 35, "iff", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_BRC, 35, "brc", 0, 0, GFX_GE(GFX7) }, + { BRW_OPCODE_ELSE, 36, "else", 0, 0, GFX_ALL }, + { BRW_OPCODE_ENDIF, 37, "endif", 0, 0, GFX_ALL }, + { BRW_OPCODE_DO, 38, "do", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_CASE, 38, "case", 0, 0, GFX6 }, + { BRW_OPCODE_WHILE, 39, "while", 0, 0, GFX_ALL }, + { BRW_OPCODE_BREAK, 40, "break", 0, 0, GFX_ALL }, + { BRW_OPCODE_CONTINUE, 41, "cont", 0, 0, GFX_ALL }, + { BRW_OPCODE_HALT, 42, 
"halt", 0, 0, GFX_ALL }, + { BRW_OPCODE_CALLA, 43, "calla", 0, 0, GFX_GE(GFX75) }, + { BRW_OPCODE_MSAVE, 44, "msave", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_CALL, 44, "call", 0, 0, GFX_GE(GFX6) }, + { BRW_OPCODE_MREST, 45, "mrest", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_RET, 45, "ret", 0, 0, GFX_GE(GFX6) }, + { BRW_OPCODE_PUSH, 46, "push", 0, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_FORK, 46, "fork", 0, 0, GFX6 }, + { BRW_OPCODE_GOTO, 46, "goto", 0, 0, GFX_GE(GFX8) }, + { BRW_OPCODE_POP, 47, "pop", 2, 0, GFX_LE(GFX5) }, + { BRW_OPCODE_WAIT, 48, "wait", 0, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SEND, 49, "send", 1, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SENDC, 50, "sendc", 1, 1, GFX_LT(GFX12) }, + { BRW_OPCODE_SEND, 49, "send", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SENDC, 50, "sendc", 2, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_SENDS, 51, "sends", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) }, + { BRW_OPCODE_SENDSC, 52, "sendsc", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) }, + { BRW_OPCODE_MATH, 56, "math", 2, 1, GFX_GE(GFX6) }, + { BRW_OPCODE_ADD, 64, "add", 2, 1, GFX_ALL }, + { BRW_OPCODE_MUL, 65, "mul", 2, 1, GFX_ALL }, + { BRW_OPCODE_AVG, 66, "avg", 2, 1, GFX_ALL }, + { BRW_OPCODE_FRC, 67, "frc", 1, 1, GFX_ALL }, + { BRW_OPCODE_RNDU, 68, "rndu", 1, 1, GFX_ALL }, + { BRW_OPCODE_RNDD, 69, "rndd", 1, 1, GFX_ALL }, + { BRW_OPCODE_RNDE, 70, "rnde", 1, 1, GFX_ALL }, + { BRW_OPCODE_RNDZ, 71, "rndz", 1, 1, GFX_ALL }, + { BRW_OPCODE_MAC, 72, "mac", 2, 1, GFX_ALL }, + { BRW_OPCODE_MACH, 73, "mach", 2, 1, GFX_ALL }, + { BRW_OPCODE_LZD, 74, "lzd", 1, 1, GFX_ALL }, + { BRW_OPCODE_FBH, 75, "fbh", 1, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_FBL, 76, "fbl", 1, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_CBIT, 77, "cbit", 1, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_ADDC, 78, "addc", 2, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_SUBB, 79, "subb", 2, 1, GFX_GE(GFX7) }, + { BRW_OPCODE_SAD2, 80, "sad2", 2, 1, GFX_ALL }, + { BRW_OPCODE_SADA2, 81, "sada2", 2, 1, GFX_ALL }, + { BRW_OPCODE_ADD3, 82, "add3", 3, 1, GFX_GE(GFX125) }, + { BRW_OPCODE_DP4, 84, 
"dp4", 2, 1, GFX_LT(GFX11) }, + { BRW_OPCODE_DPH, 85, "dph", 2, 1, GFX_LT(GFX11) }, + { BRW_OPCODE_DP3, 86, "dp3", 2, 1, GFX_LT(GFX11) }, + { BRW_OPCODE_DP2, 87, "dp2", 2, 1, GFX_LT(GFX11) }, + { BRW_OPCODE_DP4A, 88, "dp4a", 3, 1, GFX_GE(GFX12) }, + { BRW_OPCODE_LINE, 89, "line", 2, 1, GFX_LE(GFX10) }, + { BRW_OPCODE_DPAS, 89, "dpas", 3, 1, GFX_GE(GFX125) }, + { BRW_OPCODE_PLN, 90, "pln", 2, 1, GFX_GE(GFX45) & GFX_LE(GFX10) }, + { BRW_OPCODE_MAD, 91, "mad", 3, 1, GFX_GE(GFX6) }, + { BRW_OPCODE_LRP, 92, "lrp", 3, 1, GFX_GE(GFX6) & GFX_LE(GFX10) }, + { BRW_OPCODE_MADM, 93, "madm", 3, 1, GFX_GE(GFX8) }, + { BRW_OPCODE_NENOP, 125, "nenop", 0, 0, GFX45 }, + { BRW_OPCODE_NOP, 126, "nop", 0, 0, GFX_LT(GFX12) }, + { BRW_OPCODE_NOP, 96, "nop", 0, 0, GFX_GE(GFX12) } +}; + +void +brw_init_isa_info(struct brw_isa_info *isa, + const struct intel_device_info *devinfo) +{ + isa->devinfo = devinfo; + + enum gfx_ver ver = gfx_ver_from_devinfo(devinfo); + + memset(isa->ir_to_descs, 0, sizeof(isa->ir_to_descs)); + memset(isa->hw_to_descs, 0, sizeof(isa->hw_to_descs)); + + for (unsigned i = 0; i < ARRAY_SIZE(opcode_descs); i++) { + if (opcode_descs[i].gfx_vers & ver) { + const unsigned e = opcode_descs[i].ir; + const unsigned h = opcode_descs[i].hw; + assert(e < ARRAY_SIZE(isa->ir_to_descs) && !isa->ir_to_descs[e]); + assert(h < ARRAY_SIZE(isa->hw_to_descs) && !isa->hw_to_descs[h]); + isa->ir_to_descs[e] = &opcode_descs[i]; + isa->hw_to_descs[h] = &opcode_descs[i]; + } + } +} + +/** + * Return the matching opcode_desc for the specified IR opcode and hardware + * generation, or NULL if the opcode is not supported by the device. + */ +const struct opcode_desc * +brw_opcode_desc(const struct brw_isa_info *isa, enum opcode op) +{ + return op < ARRAY_SIZE(isa->ir_to_descs) ? isa->ir_to_descs[op] : NULL; +} + +/** + * Return the matching opcode_desc for the specified HW opcode and hardware + * generation, or NULL if the opcode is not supported by the device. 
+ */ +const struct opcode_desc * +brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw) +{ + return hw < ARRAY_SIZE(isa->hw_to_descs) ? isa->hw_to_descs[hw] : NULL; +} + +unsigned +brw_num_sources_from_inst(const struct brw_isa_info *isa, + const brw_inst *inst) +{ + const struct intel_device_info *devinfo = isa->devinfo; + const struct opcode_desc *desc = + brw_opcode_desc(isa, brw_inst_opcode(isa, inst)); + unsigned math_function; + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_MATH) { + math_function = brw_inst_math_function(devinfo, inst); + } else if (devinfo->ver < 6 && + brw_inst_opcode(isa, inst) == BRW_OPCODE_SEND) { + if (brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH) { + /* src1 must be a descriptor (including the information to determine + * that the SEND is doing an extended math operation), but src0 can + * actually be null since it serves as the source of the implicit GRF + * to MRF move. + * + * If we stop using that functionality, we'll have to revisit this. + */ + return 2; + } else { + /* Send instructions are allowed to have null sources since they use + * the base_mrf field to specify which message register source. 
+ */ + return 0; + } + } else { + assert(desc->nsrc < 4); + return desc->nsrc; + } + + switch (math_function) { + case BRW_MATH_FUNCTION_INV: + case BRW_MATH_FUNCTION_LOG: + case BRW_MATH_FUNCTION_EXP: + case BRW_MATH_FUNCTION_SQRT: + case BRW_MATH_FUNCTION_RSQ: + case BRW_MATH_FUNCTION_SIN: + case BRW_MATH_FUNCTION_COS: + case BRW_MATH_FUNCTION_SINCOS: + case GFX8_MATH_FUNCTION_INVM: + case GFX8_MATH_FUNCTION_RSQRTM: + return 1; + case BRW_MATH_FUNCTION_FDIV: + case BRW_MATH_FUNCTION_POW: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: + case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: + return 2; + default: + unreachable("not reached"); + } +} diff --git a/src/intel/compiler/elk/brw_eu.h b/src/intel/compiler/elk/brw_eu.h new file mode 100644 index 00000000000..e62e6e1c9e9 --- /dev/null +++ b/src/intel/compiler/elk/brw_eu.h @@ -0,0 +1,2089 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + + +#ifndef BRW_EU_H +#define BRW_EU_H + +#include +#include +#include "brw_inst.h" +#include "brw_compiler.h" +#include "brw_eu_defines.h" +#include "brw_isa_info.h" +#include "brw_reg.h" + +#include "util/bitset.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct disasm_info; + +#define BRW_EU_MAX_INSN_STACK 5 + +struct brw_insn_state { + /* One of BRW_EXECUTE_* */ + unsigned exec_size:3; + + /* Group in units of channels */ + unsigned group:5; + + /* Compression control on gfx4-5 */ + bool compressed:1; + + /* One of BRW_MASK_* */ + unsigned mask_control:1; + + /* Scheduling info for Gfx12+ */ + struct tgl_swsb swsb; + + bool saturate:1; + + /* One of BRW_ALIGN_* */ + unsigned access_mode:1; + + /* One of BRW_PREDICATE_* */ + enum brw_predicate predicate:4; + + bool pred_inv:1; + + /* Flag subreg. Bottom bit is subreg, top bit is reg */ + unsigned flag_subreg:2; + + bool acc_wr_control:1; +}; + + +/* A helper for accessing the last instruction emitted. This makes it easy + * to set various bits on an instruction without having to create temporary + * variable and assign the emitted instruction to those. 
+ */ +#define brw_last_inst (&p->store[p->nr_insn - 1]) + +struct brw_codegen { + brw_inst *store; + int store_size; + unsigned nr_insn; + unsigned int next_insn_offset; + + void *mem_ctx; + + /* Allow clients to push/pop instruction state: + */ + struct brw_insn_state stack[BRW_EU_MAX_INSN_STACK]; + struct brw_insn_state *current; + + /** Whether or not the user wants automatic exec sizes + * + * If true, codegen will try to automatically infer the exec size of an + * instruction from the width of the destination register. If false, it + * will take whatever is set by brw_set_default_exec_size verbatim. + * + * This is set to true by default in brw_init_codegen. + */ + bool automatic_exec_sizes; + + bool single_program_flow; + const struct brw_isa_info *isa; + const struct intel_device_info *devinfo; + + /* Control flow stacks: + * - if_stack contains IF and ELSE instructions which must be patched + * (and popped) once the matching ENDIF instruction is encountered. + * + * Just store the instruction pointer(an index). + */ + int *if_stack; + int if_stack_depth; + int if_stack_array_size; + + /** + * loop_stack contains the instruction pointers of the starts of loops which + * must be patched (and popped) once the matching WHILE instruction is + * encountered. + */ + int *loop_stack; + /** + * pre-gfx6, the BREAK and CONT instructions had to tell how many IF/ENDIF + * blocks they were popping out of, to fix up the mask stack. This tracks + * the IF/ENDIF nesting in each current nested loop level. 
+ */ + int *if_depth_in_loop; + int loop_stack_depth; + int loop_stack_array_size; + + struct brw_shader_reloc *relocs; + int num_relocs; + int reloc_array_size; +}; + +struct brw_label { + int offset; + int number; + struct brw_label *next; +}; + +void brw_pop_insn_state( struct brw_codegen *p ); +void brw_push_insn_state( struct brw_codegen *p ); +unsigned brw_get_default_exec_size(struct brw_codegen *p); +unsigned brw_get_default_group(struct brw_codegen *p); +unsigned brw_get_default_access_mode(struct brw_codegen *p); +struct tgl_swsb brw_get_default_swsb(struct brw_codegen *p); +void brw_set_default_exec_size(struct brw_codegen *p, unsigned value); +void brw_set_default_mask_control( struct brw_codegen *p, unsigned value ); +void brw_set_default_saturate( struct brw_codegen *p, bool enable ); +void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode ); +void brw_inst_set_compression(const struct intel_device_info *devinfo, + brw_inst *inst, bool on); +void brw_set_default_compression(struct brw_codegen *p, bool on); +void brw_inst_set_group(const struct intel_device_info *devinfo, + brw_inst *inst, unsigned group); +void brw_set_default_group(struct brw_codegen *p, unsigned group); +void brw_set_default_compression_control(struct brw_codegen *p, enum brw_compression c); +void brw_set_default_predicate_control(struct brw_codegen *p, enum brw_predicate pc); +void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse); +void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg); +void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value); +void brw_set_default_swsb(struct brw_codegen *p, struct tgl_swsb value); + +void brw_init_codegen(const struct brw_isa_info *isa, + struct brw_codegen *p, void *mem_ctx); +bool brw_has_jip(const struct intel_device_info *devinfo, enum opcode opcode); +bool brw_has_uip(const struct intel_device_info *devinfo, enum opcode opcode); +const struct 
brw_shader_reloc *brw_get_shader_relocs(struct brw_codegen *p, + unsigned *num_relocs); +const unsigned *brw_get_program( struct brw_codegen *p, unsigned *sz ); + +bool brw_should_dump_shader_bin(void); +void brw_dump_shader_bin(void *assembly, int start_offset, int end_offset, + const char *identifier); + +bool brw_try_override_assembly(struct brw_codegen *p, int start_offset, + const char *identifier); + +void brw_realign(struct brw_codegen *p, unsigned alignment); +int brw_append_data(struct brw_codegen *p, void *data, + unsigned size, unsigned alignment); +brw_inst *brw_next_insn(struct brw_codegen *p, unsigned opcode); +void brw_add_reloc(struct brw_codegen *p, uint32_t id, + enum brw_shader_reloc_type type, + uint32_t offset, uint32_t delta); +void brw_set_dest(struct brw_codegen *p, brw_inst *insn, struct brw_reg dest); +void brw_set_src0(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg); + +void gfx6_resolve_implied_move(struct brw_codegen *p, + struct brw_reg *src, + unsigned msg_reg_nr); + +/* Helpers for regular instructions: + */ +#define ALU1(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0); + +#define ALU2(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1); + +#define ALU3(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1, \ + struct brw_reg src2); + +ALU1(MOV) +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU1(DIM) +ALU2(ASR) +ALU2(ROL) +ALU2(ROR) +ALU3(CSEL) +ALU1(F32TO16) +ALU1(F16TO32) +ALU2(ADD) +ALU3(ADD3) +ALU2(AVG) +ALU2(MUL) +ALU1(FRC) +ALU1(RNDD) +ALU1(RNDE) +ALU1(RNDU) +ALU1(RNDZ) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU3(DP4A) +ALU2(LINE) +ALU2(PLN) +ALU3(MAD) +ALU3(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) +ALU2(ADDC) 
+ALU2(SUBB)
+
+#undef ALU1
+#undef ALU2
+#undef ALU3
+
+/* Scaling factor between logical message-length units and physical
+ * registers: 2 on gfx ver >= 20, 1 otherwise.
+ */
+static inline unsigned
+reg_unit(const struct intel_device_info *devinfo)
+{
+   return devinfo->ver >= 20 ? 2 : 1;
+}
+
+
+/* Helpers for SEND instruction:
+ */
+
+/**
+ * Construct a message descriptor immediate with the specified common
+ * descriptor controls.
+ *
+ * On gfx5+ the lengths are encoded in reg_unit() units and must be
+ * multiples of it; pre-gfx5 uses a narrower field layout.
+ */
+static inline uint32_t
+brw_message_desc(const struct intel_device_info *devinfo,
+                 unsigned msg_length,
+                 unsigned response_length,
+                 bool header_present)
+{
+   if (devinfo->ver >= 5) {
+      assert(msg_length % reg_unit(devinfo) == 0);
+      assert(response_length % reg_unit(devinfo) == 0);
+      return (SET_BITS(msg_length / reg_unit(devinfo), 28, 25) |
+              SET_BITS(response_length / reg_unit(devinfo), 24, 20) |
+              SET_BITS(header_present, 19, 19));
+   } else {
+      return (SET_BITS(msg_length, 23, 20) |
+              SET_BITS(response_length, 19, 16));
+   }
+}
+
+/* Extract the message length (in registers) from a message descriptor. */
+static inline unsigned
+brw_message_desc_mlen(const struct intel_device_info *devinfo, uint32_t desc)
+{
+   if (devinfo->ver >= 5)
+      return GET_BITS(desc, 28, 25) * reg_unit(devinfo);
+   else
+      return GET_BITS(desc, 23, 20);
+}
+
+/* Extract the response length (in registers) from a message descriptor. */
+static inline unsigned
+brw_message_desc_rlen(const struct intel_device_info *devinfo, uint32_t desc)
+{
+   if (devinfo->ver >= 5)
+      return GET_BITS(desc, 24, 20) * reg_unit(devinfo);
+   else
+      return GET_BITS(desc, 19, 16);
+}
+
+/* Whether the descriptor indicates a header is present (gfx5+ only). */
+static inline bool
+brw_message_desc_header_present(ASSERTED
+                                const struct intel_device_info *devinfo,
+                                uint32_t desc)
+{
+   assert(devinfo->ver >= 5);
+   return GET_BITS(desc, 19, 19);
+}
+
+/* Construct an extended message descriptor with the given extended message
+ * length (encoded in reg_unit() units, bits 9:6).
+ */
+static inline unsigned
+brw_message_ex_desc(const struct intel_device_info *devinfo,
+                    unsigned ex_msg_length)
+{
+   assert(ex_msg_length % reg_unit(devinfo) == 0);
+   return SET_BITS(ex_msg_length / reg_unit(devinfo), 9, 6);
+}
+
+/* Extract the extended message length (in registers) from an extended
+ * message descriptor.
+ */
+static inline unsigned
+brw_message_ex_desc_ex_mlen(const struct intel_device_info *devinfo,
+                            uint32_t ex_desc)
+{
+   return GET_BITS(ex_desc, 9, 6) * reg_unit(devinfo);
+}
+
+static inline uint32_t
+brw_urb_desc(const struct intel_device_info *devinfo,
+ unsigned msg_type, + bool per_slot_offset_present, + bool channel_mask_present, + unsigned global_offset) +{ + if (devinfo->ver >= 8) { + return (SET_BITS(per_slot_offset_present, 17, 17) | + SET_BITS(channel_mask_present, 15, 15) | + SET_BITS(global_offset, 14, 4) | + SET_BITS(msg_type, 3, 0)); + } else if (devinfo->ver >= 7) { + assert(!channel_mask_present); + return (SET_BITS(per_slot_offset_present, 16, 16) | + SET_BITS(global_offset, 13, 3) | + SET_BITS(msg_type, 3, 0)); + } else { + unreachable("unhandled URB write generation"); + } +} + +static inline uint32_t +brw_urb_desc_msg_type(ASSERTED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver >= 7); + return GET_BITS(desc, 3, 0); +} + +static inline uint32_t +brw_urb_fence_desc(const struct intel_device_info *devinfo) +{ + assert(devinfo->has_lsc); + return brw_urb_desc(devinfo, GFX125_URB_OPCODE_FENCE, false, false, 0); +} + +/** + * Construct a message descriptor immediate with the specified sampler + * function controls. + */ +static inline uint32_t +brw_sampler_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned simd_mode, + unsigned return_format) +{ + const unsigned desc = (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(sampler, 11, 8)); + + /* From GFX20 Bspec: Shared Functions - Message Descriptor - + * Sampling Engine: + * + * Message Type[5] 31 This bit represents the upper bit of message type + * 6-bit encoding (c.f. [16:12]). This bit is set + * for messages with programmable offsets. + */ + if (devinfo->ver >= 20) + return desc | SET_BITS(msg_type & 0x1F, 16, 12) | + SET_BITS(simd_mode & 0x3, 18, 17) | + SET_BITS(simd_mode >> 2, 29, 29) | + SET_BITS(return_format, 30, 30) | + SET_BITS(msg_type >> 5, 31, 31); + + /* From the CHV Bspec: Shared Functions - Message Descriptor - + * Sampling Engine: + * + * SIMD Mode[2] 29 This field is the upper bit of the 3-bit + * SIMD Mode field. 
+ */ + if (devinfo->ver >= 8) + return desc | SET_BITS(msg_type, 16, 12) | + SET_BITS(simd_mode & 0x3, 18, 17) | + SET_BITS(simd_mode >> 2, 29, 29) | + SET_BITS(return_format, 30, 30); + if (devinfo->ver >= 7) + return (desc | SET_BITS(msg_type, 16, 12) | + SET_BITS(simd_mode, 18, 17)); + else if (devinfo->ver >= 5) + return (desc | SET_BITS(msg_type, 15, 12) | + SET_BITS(simd_mode, 17, 16)); + else if (devinfo->verx10 >= 45) + return desc | SET_BITS(msg_type, 15, 12); + else + return (desc | SET_BITS(return_format, 13, 12) | + SET_BITS(msg_type, 15, 14)); +} + +static inline unsigned +brw_sampler_desc_binding_table_index(UNUSED + const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 7, 0); +} + +static inline unsigned +brw_sampler_desc_sampler(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 11, 8); +} + +static inline unsigned +brw_sampler_desc_msg_type(const struct intel_device_info *devinfo, uint32_t desc) +{ + if (devinfo->ver >= 20) + return GET_BITS(desc, 31, 31) << 5 | GET_BITS(desc, 16, 12); + else if (devinfo->ver >= 7) + return GET_BITS(desc, 16, 12); + else if (devinfo->verx10 >= 45) + return GET_BITS(desc, 15, 12); + else + return GET_BITS(desc, 15, 14); +} + +static inline unsigned +brw_sampler_desc_simd_mode(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver >= 5); + if (devinfo->ver >= 8) + return GET_BITS(desc, 18, 17) | GET_BITS(desc, 29, 29) << 2; + else if (devinfo->ver >= 7) + return GET_BITS(desc, 18, 17); + else + return GET_BITS(desc, 17, 16); +} + +static inline unsigned +brw_sampler_desc_return_format(ASSERTED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->verx10 == 40 || devinfo->ver >= 8); + if (devinfo->ver >= 8) + return GET_BITS(desc, 30, 30); + else + return GET_BITS(desc, 13, 12); +} + +/** + * Construct a message descriptor for the dataport + */ +static inline uint32_t +brw_dp_desc(const struct 
intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_type, + unsigned msg_control) +{ + /* Prior to gfx6, things are too inconsistent; use the dp_read/write_desc + * helpers instead. + */ + assert(devinfo->ver >= 6); + const unsigned desc = SET_BITS(binding_table_index, 7, 0); + if (devinfo->ver >= 8) { + return (desc | SET_BITS(msg_control, 13, 8) | + SET_BITS(msg_type, 18, 14)); + } else if (devinfo->ver >= 7) { + return (desc | SET_BITS(msg_control, 13, 8) | + SET_BITS(msg_type, 17, 14)); + } else { + return (desc | SET_BITS(msg_control, 12, 8) | + SET_BITS(msg_type, 16, 13)); + } +} + +static inline unsigned +brw_dp_desc_binding_table_index(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 7, 0); +} + +static inline unsigned +brw_dp_desc_msg_type(const struct intel_device_info *devinfo, uint32_t desc) +{ + assert(devinfo->ver >= 6); + if (devinfo->ver >= 8) + return GET_BITS(desc, 18, 14); + else if (devinfo->ver >= 7) + return GET_BITS(desc, 17, 14); + else + return GET_BITS(desc, 16, 13); +} + +static inline unsigned +brw_dp_desc_msg_control(const struct intel_device_info *devinfo, uint32_t desc) +{ + assert(devinfo->ver >= 6); + if (devinfo->ver >= 7) + return GET_BITS(desc, 13, 8); + else + return GET_BITS(desc, 12, 8); +} + +/** + * Construct a message descriptor immediate with the specified dataport read + * function controls. 
+ */ +static inline uint32_t +brw_dp_read_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned target_cache) +{ + if (devinfo->ver >= 6) + return brw_dp_desc(devinfo, binding_table_index, msg_type, msg_control); + else if (devinfo->verx10 >= 45) + return (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(msg_control, 10, 8) | + SET_BITS(msg_type, 13, 11) | + SET_BITS(target_cache, 15, 14)); + else + return (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(msg_control, 11, 8) | + SET_BITS(msg_type, 13, 12) | + SET_BITS(target_cache, 15, 14)); +} + +static inline unsigned +brw_dp_read_desc_msg_type(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_dp_desc_msg_type(devinfo, desc); + else if (devinfo->verx10 >= 45) + return GET_BITS(desc, 13, 11); + else + return GET_BITS(desc, 13, 12); +} + +static inline unsigned +brw_dp_read_desc_msg_control(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_dp_desc_msg_control(devinfo, desc); + else if (devinfo->verx10 >= 45) + return GET_BITS(desc, 10, 8); + else + return GET_BITS(desc, 11, 8); +} + +/** + * Construct a message descriptor immediate with the specified dataport write + * function controls. 
+ */ +static inline uint32_t +brw_dp_write_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned send_commit_msg) +{ + assert(devinfo->ver <= 6 || !send_commit_msg); + if (devinfo->ver >= 6) { + return brw_dp_desc(devinfo, binding_table_index, msg_type, msg_control) | + SET_BITS(send_commit_msg, 17, 17); + } else { + return (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(msg_control, 11, 8) | + SET_BITS(msg_type, 14, 12) | + SET_BITS(send_commit_msg, 15, 15)); + } +} + +static inline unsigned +brw_dp_write_desc_msg_type(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_dp_desc_msg_type(devinfo, desc); + else + return GET_BITS(desc, 14, 12); +} + +static inline unsigned +brw_dp_write_desc_msg_control(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_dp_desc_msg_control(devinfo, desc); + else + return GET_BITS(desc, 11, 8); +} + +static inline bool +brw_dp_write_desc_write_commit(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver <= 6); + if (devinfo->ver >= 6) + return GET_BITS(desc, 17, 17); + else + return GET_BITS(desc, 15, 15); +} + +/** + * Construct a message descriptor immediate with the specified dataport + * surface function controls. 
+ */ +static inline uint32_t +brw_dp_surface_desc(const struct intel_device_info *devinfo, + unsigned msg_type, + unsigned msg_control) +{ + assert(devinfo->ver >= 7); + /* We'll OR in the binding table index later */ + return brw_dp_desc(devinfo, 0, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_untyped_atomic_desc(const struct intel_device_info *devinfo, + unsigned exec_size, /**< 0 for SIMD4x2 */ + unsigned atomic_op, + bool response_expected) +{ + assert(exec_size <= 8 || exec_size == 16); + + unsigned msg_type; + if (devinfo->verx10 >= 75) { + if (exec_size > 0) { + msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP; + } else { + msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2; + } + } else { + msg_type = GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP; + } + + const unsigned msg_control = + SET_BITS(atomic_op, 3, 0) | + SET_BITS(0 < exec_size && exec_size <= 8, 4, 4) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_untyped_atomic_float_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + unsigned atomic_op, + bool response_expected) +{ + assert(exec_size <= 8 || exec_size == 16); + assert(devinfo->ver >= 9); + + assert(exec_size > 0); + const unsigned msg_type = GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP; + + const unsigned msg_control = + SET_BITS(atomic_op, 1, 0) | + SET_BITS(exec_size <= 8, 4, 4) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline unsigned +brw_mdc_cmask(unsigned num_channels) +{ + /* See also MDC_CMASK in the SKL PRM Vol 2d. 
*/ + return 0xf & (0xf << num_channels); +} + +static inline unsigned +lsc_cmask(unsigned num_channels) +{ + assert(num_channels > 0 && num_channels <= 4); + return BITSET_MASK(num_channels); +} + +static inline uint32_t +brw_dp_untyped_surface_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, /**< 0 for SIMD4x2 */ + unsigned num_channels, + bool write) +{ + assert(exec_size <= 8 || exec_size == 16); + + unsigned msg_type; + if (write) { + if (devinfo->verx10 >= 75) { + msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE; + } else { + msg_type = GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE; + } + } else { + /* Read */ + if (devinfo->verx10 >= 75) { + msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ; + } else { + msg_type = GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ; + } + } + + /* SIMD4x2 is only valid for read messages on IVB; use SIMD8 instead */ + if (write && devinfo->verx10 == 70 && exec_size == 0) + exec_size = 8; + + /* See also MDC_SM3 in the SKL PRM Vol 2d. */ + const unsigned simd_mode = exec_size == 0 ? 0 : /* SIMD4x2 */ + exec_size <= 8 ? 2 : 1; + + const unsigned msg_control = + SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | + SET_BITS(simd_mode, 5, 4); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline unsigned +brw_mdc_ds(unsigned bit_size) +{ + switch (bit_size) { + case 8: + return GFX7_BYTE_SCATTERED_DATA_ELEMENT_BYTE; + case 16: + return GFX7_BYTE_SCATTERED_DATA_ELEMENT_WORD; + case 32: + return GFX7_BYTE_SCATTERED_DATA_ELEMENT_DWORD; + default: + unreachable("Unsupported bit_size for byte scattered messages"); + } +} + +static inline uint32_t +brw_dp_byte_scattered_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + unsigned bit_size, + bool write) +{ + assert(exec_size <= 8 || exec_size == 16); + + assert(devinfo->verx10 >= 75); + const unsigned msg_type = + write ? 
HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE : + HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ; + + assert(exec_size > 0); + const unsigned msg_control = + SET_BITS(exec_size == 16, 0, 0) | + SET_BITS(brw_mdc_ds(bit_size), 3, 2); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_dword_scattered_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + bool write) +{ + assert(exec_size == 8 || exec_size == 16); + + unsigned msg_type; + if (write) { + if (devinfo->ver >= 6) { + msg_type = GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE; + } else { + msg_type = BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE; + } + } else { + if (devinfo->ver >= 7) { + msg_type = GFX7_DATAPORT_DC_DWORD_SCATTERED_READ; + } else if (devinfo->verx10 >= 45) { + msg_type = G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ; + } else { + msg_type = BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ; + } + } + + const unsigned msg_control = + SET_BITS(1, 1, 1) | /* Legacy SIMD Mode */ + SET_BITS(exec_size == 16, 0, 0); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_oword_block_rw_desc(const struct intel_device_info *devinfo, + bool align_16B, + unsigned num_dwords, + bool write) +{ + /* Writes can only have addresses aligned by OWORDs (16 Bytes). */ + assert(!write || align_16B); + + const unsigned msg_type = + write ? GFX7_DATAPORT_DC_OWORD_BLOCK_WRITE : + align_16B ? 
GFX7_DATAPORT_DC_OWORD_BLOCK_READ : + GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ; + + const unsigned msg_control = + SET_BITS(BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_dwords), 2, 0); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_dp_a64_untyped_surface_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, /**< 0 for SIMD4x2 */ + unsigned num_channels, + bool write) +{ + assert(exec_size <= 8 || exec_size == 16); + assert(devinfo->ver >= 8); + + unsigned msg_type = + write ? GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE : + GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ; + + /* See also MDC_SM3 in the SKL PRM Vol 2d. */ + const unsigned simd_mode = exec_size == 0 ? 0 : /* SIMD4x2 */ + exec_size <= 8 ? 2 : 1; + + const unsigned msg_control = + SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | + SET_BITS(simd_mode, 5, 4); + + return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); +} + +static inline uint32_t +brw_dp_a64_oword_block_rw_desc(const struct intel_device_info *devinfo, + bool align_16B, + unsigned num_dwords, + bool write) +{ + /* Writes can only have addresses aligned by OWORDs (16 Bytes). */ + assert(!write || align_16B); + + unsigned msg_type = + write ? GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE : + GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ; + + unsigned msg_control = + SET_BITS(!align_16B, 4, 3) | + SET_BITS(BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_dwords), 2, 0); + + return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); +} + +/** + * Calculate the data size (see MDC_A64_DS in the "Structures" volume of the + * Skylake PRM). 
+ */
+static inline uint32_t
+brw_mdc_a64_ds(unsigned elems)
+{
+   switch (elems) {
+   case 1: return 0;
+   case 2: return 1;
+   case 4: return 2;
+   case 8: return 3;
+   default:
+      /* Fixed typo: "elmeent" -> "element". */
+      unreachable("Unsupported element count for A64 scattered message");
+   }
+}
+
+/**
+ * Construct a message descriptor for an A64 (stateless) byte scattered
+ * read or write of the given element bit size.
+ */
+static inline uint32_t
+brw_dp_a64_byte_scattered_rw_desc(const struct intel_device_info *devinfo,
+                                  unsigned exec_size, /**< 0 for SIMD4x2 */
+                                  unsigned bit_size,
+                                  bool write)
+{
+   assert(exec_size <= 8 || exec_size == 16);
+   assert(devinfo->ver >= 8);
+
+   unsigned msg_type =
+      write ? GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE :
+              GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ;
+
+   const unsigned msg_control =
+      SET_BITS(GFX8_A64_SCATTERED_SUBTYPE_BYTE, 1, 0) |
+      SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2) |
+      SET_BITS(exec_size == 16, 4, 4);
+
+   return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT,
+                      msg_type, msg_control);
+}
+
+/**
+ * Construct a message descriptor for an A64 (stateless) untyped atomic
+ * integer operation.
+ */
+static inline uint32_t
+brw_dp_a64_untyped_atomic_desc(const struct intel_device_info *devinfo,
+                               ASSERTED unsigned exec_size, /**< 0 for SIMD4x2 */
+                               unsigned bit_size,
+                               unsigned atomic_op,
+                               bool response_expected)
+{
+   assert(exec_size == 8);
+   assert(devinfo->ver >= 8);
+   assert(bit_size == 16 || bit_size == 32 || bit_size == 64);
+   assert(devinfo->ver >= 12 || bit_size >= 32);
+
+   /* 16-bit atomics (gfx12+) use a distinct half-int message type. */
+   const unsigned msg_type = bit_size == 16 ?
+ GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP : + GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP; + + const unsigned msg_control = + SET_BITS(atomic_op, 3, 0) | + SET_BITS(bit_size == 64, 4, 4) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); +} + +static inline uint32_t +brw_dp_a64_untyped_atomic_float_desc(const struct intel_device_info *devinfo, + ASSERTED unsigned exec_size, + unsigned bit_size, + unsigned atomic_op, + bool response_expected) +{ + assert(exec_size == 8); + assert(devinfo->ver >= 9); + assert(bit_size == 16 || bit_size == 32); + assert(devinfo->ver >= 12 || bit_size == 32); + + assert(exec_size > 0); + const unsigned msg_type = bit_size == 32 ? + GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP : + GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP; + + const unsigned msg_control = + SET_BITS(atomic_op, 1, 0) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT, + msg_type, msg_control); +} + +static inline uint32_t +brw_dp_typed_atomic_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + unsigned exec_group, + unsigned atomic_op, + bool response_expected) +{ + assert(exec_size > 0 || exec_group == 0); + assert(exec_group % 8 == 0); + + unsigned msg_type; + if (devinfo->verx10 >= 75) { + if (exec_size == 0) { + msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2; + } else { + msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP; + } + } else { + /* SIMD4x2 typed surface R/W messages only exist on HSW+ */ + assert(exec_size > 0); + msg_type = GFX7_DATAPORT_RC_TYPED_ATOMIC_OP; + } + + const bool high_sample_mask = (exec_group / 8) % 2 == 1; + + const unsigned msg_control = + SET_BITS(atomic_op, 3, 0) | + SET_BITS(high_sample_mask, 4, 4) | + SET_BITS(response_expected, 5, 5); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t 
+brw_dp_typed_surface_rw_desc(const struct intel_device_info *devinfo, + unsigned exec_size, + unsigned exec_group, + unsigned num_channels, + bool write) +{ + assert(exec_size > 0 || exec_group == 0); + assert(exec_group % 8 == 0); + + /* Typed surface reads and writes don't support SIMD16 */ + assert(exec_size <= 8); + + unsigned msg_type; + if (write) { + if (devinfo->verx10 >= 75) { + msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE; + } else { + msg_type = GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE; + } + } else { + if (devinfo->verx10 >= 75) { + msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ; + } else { + msg_type = GFX7_DATAPORT_RC_TYPED_SURFACE_READ; + } + } + + /* See also MDC_SG3 in the SKL PRM Vol 2d. */ + unsigned msg_control; + if (devinfo->verx10 >= 75) { + /* See also MDC_SG3 in the SKL PRM Vol 2d. */ + const unsigned slot_group = exec_size == 0 ? 0 : /* SIMD4x2 */ + 1 + ((exec_group / 8) % 2); + + msg_control = + SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | + SET_BITS(slot_group, 5, 4); + } else { + /* SIMD4x2 typed surface R/W messages only exist on HSW+ */ + assert(exec_size > 0); + const unsigned slot_group = ((exec_group / 8) % 2); + + msg_control = + SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | + SET_BITS(slot_group, 5, 5); + } + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t +brw_fb_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_type, + unsigned msg_control) +{ + /* Prior to gen6, things are too inconsistent; use the fb_(read|write)_desc + * helpers instead. 
+ */ + assert(devinfo->ver >= 6); + const unsigned desc = SET_BITS(binding_table_index, 7, 0); + if (devinfo->ver >= 7) { + return (desc | SET_BITS(msg_control, 13, 8) | + SET_BITS(msg_type, 17, 14)); + } else { + return (desc | SET_BITS(msg_control, 12, 8) | + SET_BITS(msg_type, 16, 13)); + } +} + +static inline unsigned +brw_fb_desc_binding_table_index(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 7, 0); +} + +static inline uint32_t +brw_fb_desc_msg_control(const struct intel_device_info *devinfo, uint32_t desc) +{ + assert(devinfo->ver >= 6); + if (devinfo->ver >= 7) + return GET_BITS(desc, 13, 8); + else + return GET_BITS(desc, 12, 8); +} + +static inline unsigned +brw_fb_desc_msg_type(const struct intel_device_info *devinfo, uint32_t desc) +{ + assert(devinfo->ver >= 6); + if (devinfo->ver >= 7) + return GET_BITS(desc, 17, 14); + else + return GET_BITS(desc, 16, 13); +} + +static inline uint32_t +brw_fb_read_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_control, + unsigned exec_size, + bool per_sample) +{ + assert(devinfo->ver >= 9); + assert(exec_size == 8 || exec_size == 16); + + return brw_fb_desc(devinfo, binding_table_index, + GFX9_DATAPORT_RC_RENDER_TARGET_READ, msg_control) | + SET_BITS(per_sample, 13, 13) | + SET_BITS(exec_size == 8, 8, 8) /* Render Target Message Subtype */; +} + +static inline uint32_t +brw_fb_write_desc(const struct intel_device_info *devinfo, + unsigned binding_table_index, + unsigned msg_control, + bool last_render_target, + bool coarse_write) +{ + const unsigned msg_type = + devinfo->ver >= 6 ? 
+ GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE : + BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE; + + assert(devinfo->ver >= 10 || !coarse_write); + + if (devinfo->ver >= 6) { + return brw_fb_desc(devinfo, binding_table_index, msg_type, msg_control) | + SET_BITS(last_render_target, 12, 12) | + SET_BITS(coarse_write, 18, 18); + } else { + return (SET_BITS(binding_table_index, 7, 0) | + SET_BITS(msg_control, 11, 8) | + SET_BITS(last_render_target, 11, 11) | + SET_BITS(msg_type, 14, 12)); + } +} + +static inline unsigned +brw_fb_write_desc_msg_type(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_fb_desc_msg_type(devinfo, desc); + else + return GET_BITS(desc, 14, 12); +} + +static inline unsigned +brw_fb_write_desc_msg_control(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return brw_fb_desc_msg_control(devinfo, desc); + else + return GET_BITS(desc, 11, 8); +} + +static inline bool +brw_fb_write_desc_last_render_target(const struct intel_device_info *devinfo, + uint32_t desc) +{ + if (devinfo->ver >= 6) + return GET_BITS(desc, 12, 12); + else + return GET_BITS(desc, 11, 11); +} + +static inline bool +brw_fb_write_desc_write_commit(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver <= 6); + if (devinfo->ver >= 6) + return GET_BITS(desc, 17, 17); + else + return GET_BITS(desc, 15, 15); +} + +static inline bool +brw_fb_write_desc_coarse_write(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->ver >= 10); + return GET_BITS(desc, 18, 18); +} + +static inline bool +lsc_opcode_has_cmask(enum lsc_opcode opcode) +{ + return opcode == LSC_OP_LOAD_CMASK || opcode == LSC_OP_STORE_CMASK; +} + +static inline bool +lsc_opcode_has_transpose(enum lsc_opcode opcode) +{ + return opcode == LSC_OP_LOAD || opcode == LSC_OP_STORE; +} + +static inline bool +lsc_opcode_is_store(enum lsc_opcode opcode) +{ + return opcode == LSC_OP_STORE || + 
opcode == LSC_OP_STORE_CMASK; +} + +static inline bool +lsc_opcode_is_atomic(enum lsc_opcode opcode) +{ + switch (opcode) { + case LSC_OP_ATOMIC_INC: + case LSC_OP_ATOMIC_DEC: + case LSC_OP_ATOMIC_LOAD: + case LSC_OP_ATOMIC_STORE: + case LSC_OP_ATOMIC_ADD: + case LSC_OP_ATOMIC_SUB: + case LSC_OP_ATOMIC_MIN: + case LSC_OP_ATOMIC_MAX: + case LSC_OP_ATOMIC_UMIN: + case LSC_OP_ATOMIC_UMAX: + case LSC_OP_ATOMIC_CMPXCHG: + case LSC_OP_ATOMIC_FADD: + case LSC_OP_ATOMIC_FSUB: + case LSC_OP_ATOMIC_FMIN: + case LSC_OP_ATOMIC_FMAX: + case LSC_OP_ATOMIC_FCMPXCHG: + case LSC_OP_ATOMIC_AND: + case LSC_OP_ATOMIC_OR: + case LSC_OP_ATOMIC_XOR: + return true; + + default: + return false; + } +} + +static inline bool +lsc_opcode_is_atomic_float(enum lsc_opcode opcode) +{ + switch (opcode) { + case LSC_OP_ATOMIC_FADD: + case LSC_OP_ATOMIC_FSUB: + case LSC_OP_ATOMIC_FMIN: + case LSC_OP_ATOMIC_FMAX: + case LSC_OP_ATOMIC_FCMPXCHG: + return true; + + default: + return false; + } +} + +static inline unsigned +lsc_op_num_data_values(unsigned _op) +{ + enum lsc_opcode op = (enum lsc_opcode) _op; + + switch (op) { + case LSC_OP_ATOMIC_CMPXCHG: + case LSC_OP_ATOMIC_FCMPXCHG: + return 2; + case LSC_OP_ATOMIC_INC: + case LSC_OP_ATOMIC_DEC: + case LSC_OP_LOAD: + case LSC_OP_LOAD_CMASK: + case LSC_OP_FENCE: + /* XXX: actually check docs */ + return 0; + default: + return 1; + } +} + +static inline unsigned +lsc_op_to_legacy_atomic(unsigned _op) +{ + enum lsc_opcode op = (enum lsc_opcode) _op; + + switch (op) { + case LSC_OP_ATOMIC_INC: + return BRW_AOP_INC; + case LSC_OP_ATOMIC_DEC: + return BRW_AOP_DEC; + case LSC_OP_ATOMIC_STORE: + return BRW_AOP_MOV; + case LSC_OP_ATOMIC_ADD: + return BRW_AOP_ADD; + case LSC_OP_ATOMIC_SUB: + return BRW_AOP_SUB; + case LSC_OP_ATOMIC_MIN: + return BRW_AOP_IMIN; + case LSC_OP_ATOMIC_MAX: + return BRW_AOP_IMAX; + case LSC_OP_ATOMIC_UMIN: + return BRW_AOP_UMIN; + case LSC_OP_ATOMIC_UMAX: + return BRW_AOP_UMAX; + case LSC_OP_ATOMIC_CMPXCHG: + return BRW_AOP_CMPWR; + 
case LSC_OP_ATOMIC_FADD: + return BRW_AOP_FADD; + case LSC_OP_ATOMIC_FMIN: + return BRW_AOP_FMIN; + case LSC_OP_ATOMIC_FMAX: + return BRW_AOP_FMAX; + case LSC_OP_ATOMIC_FCMPXCHG: + return BRW_AOP_FCMPWR; + case LSC_OP_ATOMIC_AND: + return BRW_AOP_AND; + case LSC_OP_ATOMIC_OR: + return BRW_AOP_OR; + case LSC_OP_ATOMIC_XOR: + return BRW_AOP_XOR; + /* No LSC op maps to BRW_AOP_PREDEC */ + case LSC_OP_ATOMIC_LOAD: + case LSC_OP_ATOMIC_FSUB: + unreachable("no corresponding legacy atomic operation"); + case LSC_OP_LOAD: + case LSC_OP_LOAD_CMASK: + case LSC_OP_STORE: + case LSC_OP_STORE_CMASK: + case LSC_OP_FENCE: + unreachable("not an atomic op"); + } + + unreachable("invalid LSC op"); +} + +static inline uint32_t +lsc_data_size_bytes(enum lsc_data_size data_size) +{ + switch (data_size) { + case LSC_DATA_SIZE_D8: + return 1; + case LSC_DATA_SIZE_D16: + return 2; + case LSC_DATA_SIZE_D32: + case LSC_DATA_SIZE_D8U32: + case LSC_DATA_SIZE_D16U32: + case LSC_DATA_SIZE_D16BF32: + return 4; + case LSC_DATA_SIZE_D64: + return 8; + default: + unreachable("Unsupported data payload size."); + } +} + +static inline uint32_t +lsc_addr_size_bytes(enum lsc_addr_size addr_size) +{ + switch (addr_size) { + case LSC_ADDR_SIZE_A16: return 2; + case LSC_ADDR_SIZE_A32: return 4; + case LSC_ADDR_SIZE_A64: return 8; + default: + unreachable("Unsupported address size."); + } +} + +static inline uint32_t +lsc_vector_length(enum lsc_vect_size vect_size) +{ + switch (vect_size) { + case LSC_VECT_SIZE_V1: return 1; + case LSC_VECT_SIZE_V2: return 2; + case LSC_VECT_SIZE_V3: return 3; + case LSC_VECT_SIZE_V4: return 4; + case LSC_VECT_SIZE_V8: return 8; + case LSC_VECT_SIZE_V16: return 16; + case LSC_VECT_SIZE_V32: return 32; + case LSC_VECT_SIZE_V64: return 64; + default: + unreachable("Unsupported size of vector"); + } +} + +static inline enum lsc_vect_size +lsc_vect_size(unsigned vect_size) +{ + switch(vect_size) { + case 1: return LSC_VECT_SIZE_V1; + case 2: return LSC_VECT_SIZE_V2; + case 3: 
return LSC_VECT_SIZE_V3; + case 4: return LSC_VECT_SIZE_V4; + case 8: return LSC_VECT_SIZE_V8; + case 16: return LSC_VECT_SIZE_V16; + case 32: return LSC_VECT_SIZE_V32; + case 64: return LSC_VECT_SIZE_V64; + default: + unreachable("Unsupported vector size for dataport"); + } +} + +static inline uint32_t +lsc_msg_desc_wcmask(UNUSED const struct intel_device_info *devinfo, + enum lsc_opcode opcode, unsigned simd_size, + enum lsc_addr_surface_type addr_type, + enum lsc_addr_size addr_sz, unsigned num_coordinates, + enum lsc_data_size data_sz, unsigned num_channels, + bool transpose, unsigned cache_ctrl, bool has_dest, unsigned cmask) +{ + assert(devinfo->has_lsc); + + unsigned dest_length = !has_dest ? 0 : + DIV_ROUND_UP(lsc_data_size_bytes(data_sz) * num_channels * simd_size, + reg_unit(devinfo) * REG_SIZE); + + unsigned src0_length = + DIV_ROUND_UP(lsc_addr_size_bytes(addr_sz) * num_coordinates * simd_size, + reg_unit(devinfo) * REG_SIZE); + + assert(!transpose || lsc_opcode_has_transpose(opcode)); + + unsigned msg_desc = + SET_BITS(opcode, 5, 0) | + SET_BITS(addr_sz, 8, 7) | + SET_BITS(data_sz, 11, 9) | + SET_BITS(transpose, 15, 15) | + SET_BITS(cache_ctrl, 19, 17) | + SET_BITS(dest_length, 24, 20) | + SET_BITS(src0_length, 28, 25) | + SET_BITS(addr_type, 30, 29); + + if (lsc_opcode_has_cmask(opcode)) + msg_desc |= SET_BITS(cmask ? 
cmask : lsc_cmask(num_channels), 15, 12); + else + msg_desc |= SET_BITS(lsc_vect_size(num_channels), 14, 12); + + return msg_desc; +} + +static inline uint32_t +lsc_msg_desc(UNUSED const struct intel_device_info *devinfo, + enum lsc_opcode opcode, unsigned simd_size, + enum lsc_addr_surface_type addr_type, + enum lsc_addr_size addr_sz, unsigned num_coordinates, + enum lsc_data_size data_sz, unsigned num_channels, + bool transpose, unsigned cache_ctrl, bool has_dest) +{ + return lsc_msg_desc_wcmask(devinfo, opcode, simd_size, addr_type, addr_sz, + num_coordinates, data_sz, num_channels, transpose, cache_ctrl, + has_dest, 0); +} + +static inline enum lsc_opcode +lsc_msg_desc_opcode(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_opcode) GET_BITS(desc, 5, 0); +} + +static inline enum lsc_addr_size +lsc_msg_desc_addr_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_addr_size) GET_BITS(desc, 8, 7); +} + +static inline enum lsc_data_size +lsc_msg_desc_data_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_data_size) GET_BITS(desc, 11, 9); +} + +static inline enum lsc_vect_size +lsc_msg_desc_vect_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + assert(!lsc_opcode_has_cmask(lsc_msg_desc_opcode(devinfo, desc))); + return (enum lsc_vect_size) GET_BITS(desc, 14, 12); +} + +static inline enum lsc_cmask +lsc_msg_desc_cmask(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + assert(lsc_opcode_has_cmask(lsc_msg_desc_opcode(devinfo, desc))); + return (enum lsc_cmask) GET_BITS(desc, 15, 12); +} + +static inline bool +lsc_msg_desc_transpose(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(desc, 15, 15); +} + +static 
inline unsigned +lsc_msg_desc_cache_ctrl(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(desc, 19, 17); +} + +static inline unsigned +lsc_msg_desc_dest_len(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(desc, 24, 20) * reg_unit(devinfo); +} + +static inline unsigned +lsc_msg_desc_src0_len(const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(desc, 28, 25) * reg_unit(devinfo); +} + +static inline enum lsc_addr_surface_type +lsc_msg_desc_addr_type(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_addr_surface_type) GET_BITS(desc, 30, 29); +} + +static inline uint32_t +lsc_fence_msg_desc(UNUSED const struct intel_device_info *devinfo, + enum lsc_fence_scope scope, + enum lsc_flush_type flush_type, + bool route_to_lsc) +{ + assert(devinfo->has_lsc); + return SET_BITS(LSC_OP_FENCE, 5, 0) | + SET_BITS(LSC_ADDR_SIZE_A32, 8, 7) | + SET_BITS(scope, 11, 9) | + SET_BITS(flush_type, 14, 12) | + SET_BITS(route_to_lsc, 18, 18) | + SET_BITS(LSC_ADDR_SURFTYPE_FLAT, 30, 29); +} + +static inline enum lsc_fence_scope +lsc_fence_msg_desc_scope(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_fence_scope) GET_BITS(desc, 11, 9); +} + +static inline enum lsc_flush_type +lsc_fence_msg_desc_flush_type(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_flush_type) GET_BITS(desc, 14, 12); +} + +static inline enum lsc_backup_fence_routing +lsc_fence_msg_desc_backup_routing(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + assert(devinfo->has_lsc); + return (enum lsc_backup_fence_routing) GET_BITS(desc, 18, 18); +} + +static inline uint32_t +lsc_bti_ex_desc(const struct intel_device_info *devinfo, unsigned 
bti) +{ + assert(devinfo->has_lsc); + return SET_BITS(bti, 31, 24) | + SET_BITS(0, 23, 12); /* base offset */ +} + +static inline unsigned +lsc_bti_ex_desc_base_offset(const struct intel_device_info *devinfo, + uint32_t ex_desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(ex_desc, 23, 12); +} + +static inline unsigned +lsc_bti_ex_desc_index(const struct intel_device_info *devinfo, + uint32_t ex_desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(ex_desc, 31, 24); +} + +static inline unsigned +lsc_flat_ex_desc_base_offset(const struct intel_device_info *devinfo, + uint32_t ex_desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(ex_desc, 31, 12); +} + +static inline uint32_t +lsc_bss_ex_desc(const struct intel_device_info *devinfo, + unsigned surface_state_index) +{ + assert(devinfo->has_lsc); + return SET_BITS(surface_state_index, 31, 6); +} + +static inline unsigned +lsc_bss_ex_desc_index(const struct intel_device_info *devinfo, + uint32_t ex_desc) +{ + assert(devinfo->has_lsc); + return GET_BITS(ex_desc, 31, 6); +} + +static inline uint32_t +brw_mdc_sm2(unsigned exec_size) +{ + assert(exec_size == 8 || exec_size == 16); + return exec_size > 8; +} + +static inline uint32_t +brw_mdc_sm2_exec_size(uint32_t sm2) +{ + assert(sm2 <= 1); + return 8 << sm2; +} + +static inline uint32_t +brw_btd_spawn_desc(ASSERTED const struct intel_device_info *devinfo, + unsigned exec_size, unsigned msg_type) +{ + assert(devinfo->has_ray_tracing); + assert(devinfo->ver < 20 || exec_size == 16); + + return SET_BITS(0, 19, 19) | /* No header */ + SET_BITS(msg_type, 17, 14) | + SET_BITS(brw_mdc_sm2(exec_size), 8, 8); +} + +static inline uint32_t +brw_btd_spawn_msg_type(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return GET_BITS(desc, 17, 14); +} + +static inline uint32_t +brw_btd_spawn_exec_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return brw_mdc_sm2_exec_size(GET_BITS(desc, 8, 8)); +} + +static inline uint32_t 
+brw_rt_trace_ray_desc(ASSERTED const struct intel_device_info *devinfo, + unsigned exec_size) +{ + assert(devinfo->has_ray_tracing); + assert(devinfo->ver < 20 || exec_size == 16); + + return SET_BITS(0, 19, 19) | /* No header */ + SET_BITS(0, 17, 14) | /* Message type */ + SET_BITS(brw_mdc_sm2(exec_size), 8, 8); +} + +static inline uint32_t +brw_rt_trace_ray_desc_exec_size(UNUSED const struct intel_device_info *devinfo, + uint32_t desc) +{ + return brw_mdc_sm2_exec_size(GET_BITS(desc, 8, 8)); +} + +/** + * Construct a message descriptor immediate with the specified pixel + * interpolator function controls. + */ +static inline uint32_t +brw_pixel_interp_desc(UNUSED const struct intel_device_info *devinfo, + unsigned msg_type, + bool noperspective, + bool coarse_pixel_rate, + unsigned exec_size, + unsigned group) +{ + assert(exec_size == 8 || exec_size == 16); + const bool simd_mode = exec_size == 16; + const bool slot_group = group >= 16; + + assert(devinfo->ver >= 10 || !coarse_pixel_rate); + return (SET_BITS(slot_group, 11, 11) | + SET_BITS(msg_type, 13, 12) | + SET_BITS(!!noperspective, 14, 14) | + SET_BITS(coarse_pixel_rate, 15, 15) | + SET_BITS(simd_mode, 16, 16)); +} + +void brw_urb_WRITE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + enum brw_urb_write_flags flags, + unsigned msg_length, + unsigned response_length, + unsigned offset, + unsigned swizzle); + +/** + * Send message to shared unit \p sfid with a possibly indirect descriptor \p + * desc. If \p desc is not an immediate it will be transparently loaded to an + * address register using an OR instruction. 
+ */ +void +brw_send_indirect_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg desc, + unsigned desc_imm, + bool eot); + +void +brw_send_indirect_split_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload0, + struct brw_reg payload1, + struct brw_reg desc, + unsigned desc_imm, + struct brw_reg ex_desc, + unsigned ex_desc_imm, + bool ex_desc_scratch, + bool ex_bso, + bool eot); + +void brw_ff_sync(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + unsigned response_length, + bool eot); + +void brw_svb_write(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + bool send_commit_msg); + +brw_inst *brw_fb_WRITE(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg implied_header, + unsigned msg_control, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool eot, + bool last_render_target, + bool header_present); + +brw_inst *gfx9_fb_READ(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool per_sample); + +void brw_SAMPLE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + unsigned header_present, + unsigned simd_mode, + unsigned return_format); + +void brw_adjust_sampler_state_pointer(struct brw_codegen *p, + struct brw_reg header, + struct brw_reg sampler_index); + +void gfx4_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + unsigned msg_reg_nr, + struct brw_reg src, + unsigned precision ); + +void gfx6_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + struct brw_reg src0, 
+ struct brw_reg src1); + +void brw_oword_block_read(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + uint32_t offset, + uint32_t bind_table_index); + +unsigned brw_scratch_surface_idx(const struct brw_codegen *p); + +void brw_oword_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + unsigned offset); + +void brw_oword_block_write_scratch(struct brw_codegen *p, + struct brw_reg mrf, + int num_regs, + unsigned offset); + +void gfx7_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + int num_regs, + unsigned offset); + +/** + * Return the generation-specific jump distance scaling factor. + * + * Given the number of instructions to jump, we need to scale by + * some number to obtain the actual jump distance to program in an + * instruction. + */ +static inline unsigned +brw_jump_scale(const struct intel_device_info *devinfo) +{ + /* Broadwell measures jump targets in bytes. */ + if (devinfo->ver >= 8) + return 16; + + /* Ironlake and later measure jump targets in 64-bit data chunks (in order + * (to support compaction), so each 128-bit instruction requires 2 chunks. + */ + if (devinfo->ver >= 5) + return 2; + + /* Gfx4 simply uses the number of 128-bit instructions. */ + return 1; +} + +void brw_barrier(struct brw_codegen *p, struct brw_reg src); + +/* If/else/endif. Works by manipulating the execution flags on each + * channel. 
+ */ +brw_inst *brw_IF(struct brw_codegen *p, unsigned execute_size); +brw_inst *gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional, + struct brw_reg src0, struct brw_reg src1); + +void brw_ELSE(struct brw_codegen *p); +void brw_ENDIF(struct brw_codegen *p); + +/* DO/WHILE loops: + */ +brw_inst *brw_DO(struct brw_codegen *p, unsigned execute_size); + +brw_inst *brw_WHILE(struct brw_codegen *p); + +brw_inst *brw_BREAK(struct brw_codegen *p); +brw_inst *brw_CONT(struct brw_codegen *p); +brw_inst *brw_HALT(struct brw_codegen *p); + +/* Forward jumps: + */ +void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx); + +brw_inst *brw_JMPI(struct brw_codegen *p, struct brw_reg index, + unsigned predicate_control); + +void brw_NOP(struct brw_codegen *p); + +void brw_WAIT(struct brw_codegen *p); + +void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func); + +/* Special case: there is never a destination, execution size will be + * taken from src0: + */ +void brw_CMP(struct brw_codegen *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1); + +void brw_CMPN(struct brw_codegen *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1); + +brw_inst *brw_DPAS(struct brw_codegen *p, enum gfx12_systolic_depth sdepth, + unsigned rcount, struct brw_reg dest, struct brw_reg src0, + struct brw_reg src1, struct brw_reg src2); + +void +brw_untyped_atomic(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned atomic_op, + unsigned msg_length, + bool response_expected, + bool header_present); + +void +brw_untyped_surface_read(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels); + +void +brw_untyped_surface_write(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels, + 
bool header_present); + +void +brw_memory_fence(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + enum opcode send_op, + enum brw_message_target sfid, + uint32_t desc, + bool commit_enable, + unsigned bti); + +void +brw_pixel_interpolator_query(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + bool noperspective, + bool coarse_pixel_rate, + unsigned mode, + struct brw_reg data, + unsigned msg_length, + unsigned response_length); + +void +brw_find_live_channel(struct brw_codegen *p, + struct brw_reg dst, + bool last); + +void +brw_broadcast(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg idx); + +void +brw_float_controls_mode(struct brw_codegen *p, + unsigned mode, unsigned mask); + +void +brw_update_reloc_imm(const struct brw_isa_info *isa, + brw_inst *inst, + uint32_t value); + +void +brw_MOV_reloc_imm(struct brw_codegen *p, + struct brw_reg dst, + enum brw_reg_type src_type, + uint32_t id); + +unsigned +brw_num_sources_from_inst(const struct brw_isa_info *isa, + const brw_inst *inst); + +/*********************************************************************** + * brw_eu_util.c: + */ + +void brw_copy_indirect_to_indirect(struct brw_codegen *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + unsigned count); + +void brw_copy_from_indirect(struct brw_codegen *p, + struct brw_reg dst, + struct brw_indirect ptr, + unsigned count); + +void brw_copy4(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count); + +void brw_copy8(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count); + +void brw_math_invert( struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src); + +void brw_set_src1(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg); + +void brw_set_desc_ex(struct brw_codegen *p, brw_inst *insn, + unsigned desc, unsigned ex_desc); + +static inline void +brw_set_desc(struct brw_codegen *p, brw_inst *insn, 
unsigned desc) +{ + brw_set_desc_ex(p, insn, desc, 0); +} + +void brw_set_uip_jip(struct brw_codegen *p, int start_offset); + +enum brw_conditional_mod brw_negate_cmod(enum brw_conditional_mod cmod); +enum brw_conditional_mod brw_swap_cmod(enum brw_conditional_mod cmod); + +/* brw_eu_compact.c */ +void brw_compact_instructions(struct brw_codegen *p, int start_offset, + struct disasm_info *disasm); +void brw_uncompact_instruction(const struct brw_isa_info *isa, + brw_inst *dst, brw_compact_inst *src); +bool brw_try_compact_instruction(const struct brw_isa_info *isa, + brw_compact_inst *dst, const brw_inst *src); + +void brw_debug_compact_uncompact(const struct brw_isa_info *isa, + brw_inst *orig, brw_inst *uncompacted); + +/* brw_eu_validate.c */ +bool brw_validate_instruction(const struct brw_isa_info *isa, + const brw_inst *inst, int offset, + unsigned inst_size, + struct disasm_info *disasm); +bool brw_validate_instructions(const struct brw_isa_info *isa, + const void *assembly, int start_offset, int end_offset, + struct disasm_info *disasm); + +static inline int +next_offset(const struct intel_device_info *devinfo, void *store, int offset) +{ + brw_inst *insn = (brw_inst *)((char *)store + offset); + + if (brw_inst_cmpt_control(devinfo, insn)) + return offset + 8; + else + return offset + 16; +} + +/** Maximum SEND message length */ +#define BRW_MAX_MSG_LENGTH 15 + +/** First MRF register used by pull loads */ +#define FIRST_SPILL_MRF(gen) ((gen) == 6 ? 21 : 13) + +/** First MRF register used by spills */ +#define FIRST_PULL_LOAD_MRF(gen) ((gen) == 6 ? 
16 : 13) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/intel/compiler/elk/brw_eu_compact.c b/src/intel/compiler/elk/brw_eu_compact.c new file mode 100644 index 00000000000..356650ffd20 --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_compact.c @@ -0,0 +1,3081 @@ +/* + * Copyright © 2012-2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_eu_compact.c + * + * Instruction compaction is a feature of G45 and newer hardware that allows + * for a smaller instruction encoding. + * + * The instruction cache is on the order of 32KB, and many programs generate + * far more instructions than that. The instruction cache is built to barely + * keep up with instruction dispatch ability in cache hit cases -- L1 + * instruction cache misses that still hit in the next level could limit + * throughput by around 50%. 
+ * + * The idea of instruction compaction is that most instructions use a tiny + * subset of the GPU functionality, so we can encode what would be a 16 byte + * instruction in 8 bytes using some lookup tables for various fields. + * + * + * Instruction compaction capabilities vary subtly by generation. + * + * G45's support for instruction compaction is very limited. Jump counts on + * this generation are in units of 16-byte uncompacted instructions. As such, + * all jump targets must be 16-byte aligned. Also, all instructions must be + * naturally aligned, i.e. uncompacted instructions must be 16-byte aligned. + * A G45-only instruction, NENOP, must be used to provide padding to align + * uncompacted instructions. + * + * Gfx5 removes these restrictions and changes jump counts to be in units of + * 8-byte compacted instructions, allowing jump targets to be only 8-byte + * aligned. Uncompacted instructions can also be placed on 8-byte boundaries. + * + * Gfx6 adds the ability to compact instructions with a limited range of + * immediate values. Compactable immediates have 12 unrestricted bits, and a + * 13th bit that's replicated through the high 20 bits, to create the 32-bit + * value of DW3 in the uncompacted instruction word. + * + * On Gfx7 we can compact some control flow instructions with a small positive + * immediate in the low bits of DW3, like ENDIF with the JIP field. Other + * control flow instructions with UIP cannot be compacted, because of the + * replicated 13th bit. No control flow instructions can be compacted on Gfx6 + * since the jump count field is not in DW3. + * + * break JIP/UIP + * cont JIP/UIP + * halt JIP/UIP + * if JIP/UIP + * else JIP (plus UIP on BDW+) + * endif JIP + * while JIP (must be negative) + * + * Gen 8 adds support for compacting 3-src instructions. 
+ * + * Gfx12 reduces the number of bits that available to compacted immediates from + * 13 to 12, but improves the compaction of floating-point immediates by + * allowing the high bits to be encoded (the sign, 8-bit exponent, and the + * three most significant bits of the mantissa), rather than the lowest bits of + * the mantissa. + */ + +#include "brw_eu.h" +#include "brw_disasm.h" +#include "brw_shader.h" +#include "brw_disasm_info.h" +#include "dev/intel_debug.h" + +static const uint32_t g45_control_index_table[32] = { + 0b00000000000000000, + 0b01000000000000000, + 0b00110000000000000, + 0b00000000000000010, + 0b00100000000000000, + 0b00010000000000000, + 0b01000000000100000, + 0b01000000100000000, + 0b01010000000100000, + 0b00000000100000010, + 0b11000000000000000, + 0b00001000100000010, + 0b01001000100000000, + 0b00000000100000000, + 0b11000000000100000, + 0b00001000100000000, + 0b10110000000000000, + 0b11010000000100000, + 0b00110000100000000, + 0b00100000100000000, + 0b01000000000001000, + 0b01000000000000100, + 0b00111100000000000, + 0b00101011000000000, + 0b00110000000010000, + 0b00010000100000000, + 0b01000000000100100, + 0b01000000000101000, + 0b00110000000000110, + 0b00000000000001010, + 0b01010000000101000, + 0b01010000000100100, +}; + +static const uint32_t g45_datatype_table[32] = { + 0b001000000000100001, + 0b001011010110101101, + 0b001000001000110001, + 0b001111011110111101, + 0b001011010110101100, + 0b001000000110101101, + 0b001000000000100000, + 0b010100010110110001, + 0b001100011000101101, + 0b001000000000100010, + 0b001000001000110110, + 0b010000001000110001, + 0b001000001000110010, + 0b011000001000110010, + 0b001111011110111100, + 0b001000000100101000, + 0b010100011000110001, + 0b001010010100101001, + 0b001000001000101001, + 0b010000001000110110, + 0b101000001000110001, + 0b001011011000101101, + 0b001000000100001001, + 0b001011011000101100, + 0b110100011000110001, + 0b001000001110111101, + 0b110000001000110001, + 0b011000000100101010, + 
0b101000001000101001, + 0b001011010110001100, + 0b001000000110100001, + 0b001010010100001000, +}; + +static const uint16_t g45_subreg_table[32] = { + 0b000000000000000, + 0b000000010000000, + 0b000001000000000, + 0b000100000000000, + 0b000000000100000, + 0b100000000000000, + 0b000000000010000, + 0b001100000000000, + 0b001010000000000, + 0b000000100000000, + 0b001000000000000, + 0b000000000001000, + 0b000000001000000, + 0b000000000000001, + 0b000010000000000, + 0b000000010100000, + 0b000000000000111, + 0b000001000100000, + 0b011000000000000, + 0b000000110000000, + 0b000000000000010, + 0b000000000000100, + 0b000000001100000, + 0b000100000000010, + 0b001110011000110, + 0b001110100001000, + 0b000110011000110, + 0b000001000011000, + 0b000110010000100, + 0b001100000000110, + 0b000000010000110, + 0b000001000110000, +}; + +static const uint16_t g45_src_index_table[32] = { + 0b000000000000, + 0b010001101000, + 0b010110001000, + 0b011010010000, + 0b001101001000, + 0b010110001010, + 0b010101110000, + 0b011001111000, + 0b001000101000, + 0b000000101000, + 0b010001010000, + 0b111101101100, + 0b010110001100, + 0b010001101100, + 0b011010010100, + 0b010001001100, + 0b001100101000, + 0b000000000010, + 0b111101001100, + 0b011001101000, + 0b010101001000, + 0b000000000100, + 0b000000101100, + 0b010001101010, + 0b000000111000, + 0b010101011000, + 0b000100100000, + 0b010110000000, + 0b010000000100, + 0b010000111000, + 0b000101100000, + 0b111101110100, +}; + +static const uint32_t gfx6_control_index_table[32] = { + 0b00000000000000000, + 0b01000000000000000, + 0b00110000000000000, + 0b00000000100000000, + 0b00010000000000000, + 0b00001000100000000, + 0b00000000100000010, + 0b00000000000000010, + 0b01000000100000000, + 0b01010000000000000, + 0b10110000000000000, + 0b00100000000000000, + 0b11010000000000000, + 0b11000000000000000, + 0b01001000100000000, + 0b01000000000001000, + 0b01000000000000100, + 0b00000000000001000, + 0b00000000000000100, + 0b00111000100000000, + 0b00001000100000010, + 
0b00110000100000000, + 0b00110000000000001, + 0b00100000000000001, + 0b00110000000000010, + 0b00110000000000101, + 0b00110000000001001, + 0b00110000000010000, + 0b00110000000000011, + 0b00110000000000100, + 0b00110000100001000, + 0b00100000000001001, +}; + +static const uint32_t gfx6_datatype_table[32] = { + 0b001001110000000000, + 0b001000110000100000, + 0b001001110000000001, + 0b001000000001100000, + 0b001010110100101001, + 0b001000000110101101, + 0b001100011000101100, + 0b001011110110101101, + 0b001000000111101100, + 0b001000000001100001, + 0b001000110010100101, + 0b001000000001000001, + 0b001000001000110001, + 0b001000001000101001, + 0b001000000000100000, + 0b001000001000110010, + 0b001010010100101001, + 0b001011010010100101, + 0b001000000110100101, + 0b001100011000101001, + 0b001011011000101100, + 0b001011010110100101, + 0b001011110110100101, + 0b001111011110111101, + 0b001111011110111100, + 0b001111011110111101, + 0b001111011110011101, + 0b001111011110111110, + 0b001000000000100001, + 0b001000000000100010, + 0b001001111111011101, + 0b001000001110111110, +}; + +static const uint16_t gfx6_subreg_table[32] = { + 0b000000000000000, + 0b000000000000100, + 0b000000110000000, + 0b111000000000000, + 0b011110000001000, + 0b000010000000000, + 0b000000000010000, + 0b000110000001100, + 0b001000000000000, + 0b000001000000000, + 0b000001010010100, + 0b000000001010110, + 0b010000000000000, + 0b110000000000000, + 0b000100000000000, + 0b000000010000000, + 0b000000000001000, + 0b100000000000000, + 0b000001010000000, + 0b001010000000000, + 0b001100000000000, + 0b000000001010100, + 0b101101010010100, + 0b010100000000000, + 0b000000010001111, + 0b011000000000000, + 0b111110000000000, + 0b101000000000000, + 0b000000000001111, + 0b000100010001111, + 0b001000010001111, + 0b000110000000000, +}; + +static const uint16_t gfx6_src_index_table[32] = { + 0b000000000000, + 0b010110001000, + 0b010001101000, + 0b001000101000, + 0b011010010000, + 0b000100100000, + 0b010001101100, + 
0b010101110000, + 0b011001111000, + 0b001100101000, + 0b010110001100, + 0b001000100000, + 0b010110001010, + 0b000000000010, + 0b010101010000, + 0b010101101000, + 0b111101001100, + 0b111100101100, + 0b011001110000, + 0b010110001001, + 0b010101011000, + 0b001101001000, + 0b010000101100, + 0b010000000000, + 0b001101110000, + 0b001100010000, + 0b001100000000, + 0b010001101010, + 0b001101111000, + 0b000001110000, + 0b001100100000, + 0b001101010000, +}; + +static const uint32_t gfx7_control_index_table[32] = { + 0b0000000000000000010, + 0b0000100000000000000, + 0b0000100000000000001, + 0b0000100000000000010, + 0b0000100000000000011, + 0b0000100000000000100, + 0b0000100000000000101, + 0b0000100000000000111, + 0b0000100000000001000, + 0b0000100000000001001, + 0b0000100000000001101, + 0b0000110000000000000, + 0b0000110000000000001, + 0b0000110000000000010, + 0b0000110000000000011, + 0b0000110000000000100, + 0b0000110000000000101, + 0b0000110000000000111, + 0b0000110000000001001, + 0b0000110000000001101, + 0b0000110000000010000, + 0b0000110000100000000, + 0b0001000000000000000, + 0b0001000000000000010, + 0b0001000000000000100, + 0b0001000000100000000, + 0b0010110000000000000, + 0b0010110000000010000, + 0b0011000000000000000, + 0b0011000000100000000, + 0b0101000000000000000, + 0b0101000000100000000, +}; + +static const uint32_t gfx7_datatype_table[32] = { + 0b001000000000000001, + 0b001000000000100000, + 0b001000000000100001, + 0b001000000001100001, + 0b001000000010111101, + 0b001000001011111101, + 0b001000001110100001, + 0b001000001110100101, + 0b001000001110111101, + 0b001000010000100001, + 0b001000110000100000, + 0b001000110000100001, + 0b001001010010100101, + 0b001001110010100100, + 0b001001110010100101, + 0b001111001110111101, + 0b001111011110011101, + 0b001111011110111100, + 0b001111011110111101, + 0b001111111110111100, + 0b000000001000001100, + 0b001000000000111101, + 0b001000000010100101, + 0b001000010000100000, + 0b001001010010100100, + 0b001001110010000100, + 
0b001010010100001001, + 0b001101111110111101, + 0b001111111110111101, + 0b001011110110101100, + 0b001010010100101000, + 0b001010110100101000, +}; + +static const uint16_t gfx7_subreg_table[32] = { + 0b000000000000000, + 0b000000000000001, + 0b000000000001000, + 0b000000000001111, + 0b000000000010000, + 0b000000010000000, + 0b000000100000000, + 0b000000110000000, + 0b000001000000000, + 0b000001000010000, + 0b000010100000000, + 0b001000000000000, + 0b001000000000001, + 0b001000010000001, + 0b001000010000010, + 0b001000010000011, + 0b001000010000100, + 0b001000010000111, + 0b001000010001000, + 0b001000010001110, + 0b001000010001111, + 0b001000110000000, + 0b001000111101000, + 0b010000000000000, + 0b010000110000000, + 0b011000000000000, + 0b011110010000111, + 0b100000000000000, + 0b101000000000000, + 0b110000000000000, + 0b111000000000000, + 0b111000000011100, +}; + +static const uint16_t gfx7_src_index_table[32] = { + 0b000000000000, + 0b000000000010, + 0b000000010000, + 0b000000010010, + 0b000000011000, + 0b000000100000, + 0b000000101000, + 0b000001001000, + 0b000001010000, + 0b000001110000, + 0b000001111000, + 0b001100000000, + 0b001100000010, + 0b001100001000, + 0b001100010000, + 0b001100010010, + 0b001100100000, + 0b001100101000, + 0b001100111000, + 0b001101000000, + 0b001101000010, + 0b001101001000, + 0b001101010000, + 0b001101100000, + 0b001101101000, + 0b001101110000, + 0b001101110001, + 0b001101111000, + 0b010001101000, + 0b010001101001, + 0b010001101010, + 0b010110001000, +}; + +static const uint32_t gfx8_control_index_table[32] = { + 0b0000000000000000010, + 0b0000100000000000000, + 0b0000100000000000001, + 0b0000100000000000010, + 0b0000100000000000011, + 0b0000100000000000100, + 0b0000100000000000101, + 0b0000100000000000111, + 0b0000100000000001000, + 0b0000100000000001001, + 0b0000100000000001101, + 0b0000110000000000000, + 0b0000110000000000001, + 0b0000110000000000010, + 0b0000110000000000011, + 0b0000110000000000100, + 0b0000110000000000101, + 
0b0000110000000000111, + 0b0000110000000001001, + 0b0000110000000001101, + 0b0000110000000010000, + 0b0000110000100000000, + 0b0001000000000000000, + 0b0001000000000000010, + 0b0001000000000000100, + 0b0001000000100000000, + 0b0010110000000000000, + 0b0010110000000010000, + 0b0011000000000000000, + 0b0011000000100000000, + 0b0101000000000000000, + 0b0101000000100000000, +}; + +static const uint32_t gfx8_datatype_table[32] = { + 0b001000000000000000001, + 0b001000000000001000000, + 0b001000000000001000001, + 0b001000000000011000001, + 0b001000000000101011101, + 0b001000000010111011101, + 0b001000000011101000001, + 0b001000000011101000101, + 0b001000000011101011101, + 0b001000001000001000001, + 0b001000011000001000000, + 0b001000011000001000001, + 0b001000101000101000101, + 0b001000111000101000100, + 0b001000111000101000101, + 0b001011100011101011101, + 0b001011101011100011101, + 0b001011101011101011100, + 0b001011101011101011101, + 0b001011111011101011100, + 0b000000000010000001100, + 0b001000000000001011101, + 0b001000000000101000101, + 0b001000001000001000000, + 0b001000101000101000100, + 0b001000111000100000100, + 0b001001001001000001001, + 0b001010111011101011101, + 0b001011111011101011101, + 0b001001111001101001100, + 0b001001001001001001000, + 0b001001011001001001000, +}; + +static const uint16_t gfx8_subreg_table[32] = { + 0b000000000000000, + 0b000000000000001, + 0b000000000001000, + 0b000000000001111, + 0b000000000010000, + 0b000000010000000, + 0b000000100000000, + 0b000000110000000, + 0b000001000000000, + 0b000001000010000, + 0b000001010000000, + 0b001000000000000, + 0b001000000000001, + 0b001000010000001, + 0b001000010000010, + 0b001000010000011, + 0b001000010000100, + 0b001000010000111, + 0b001000010001000, + 0b001000010001110, + 0b001000010001111, + 0b001000110000000, + 0b001000111101000, + 0b010000000000000, + 0b010000110000000, + 0b011000000000000, + 0b011110010000111, + 0b100000000000000, + 0b101000000000000, + 0b110000000000000, + 0b111000000000000, 
+ 0b111000000011100, +}; + +static const uint16_t gfx8_src_index_table[32] = { + 0b000000000000, + 0b000000000010, + 0b000000010000, + 0b000000010010, + 0b000000011000, + 0b000000100000, + 0b000000101000, + 0b000001001000, + 0b000001010000, + 0b000001110000, + 0b000001111000, + 0b001100000000, + 0b001100000010, + 0b001100001000, + 0b001100010000, + 0b001100010010, + 0b001100100000, + 0b001100101000, + 0b001100111000, + 0b001101000000, + 0b001101000010, + 0b001101001000, + 0b001101010000, + 0b001101100000, + 0b001101101000, + 0b001101110000, + 0b001101110001, + 0b001101111000, + 0b010001101000, + 0b010001101001, + 0b010001101010, + 0b010110001000, +}; + +static const uint32_t gfx11_datatype_table[32] = { + 0b001000000000000000001, + 0b001000000000001000000, + 0b001000000000001000001, + 0b001000000000011000001, + 0b001000000000101100101, + 0b001000000101111100101, + 0b001000000100101000001, + 0b001000000100101000101, + 0b001000000100101100101, + 0b001000001000001000001, + 0b001000011000001000000, + 0b001000011000001000001, + 0b001000101000101000101, + 0b001000111000101000100, + 0b001000111000101000101, + 0b001100100100101100101, + 0b001100101100100100101, + 0b001100101100101100100, + 0b001100101100101100101, + 0b001100111100101100100, + 0b000000000010000001100, + 0b001000000000001100101, + 0b001000000000101000101, + 0b001000001000001000000, + 0b001000101000101000100, + 0b001000111000100000100, + 0b001001001001000001001, + 0b001101111100101100101, + 0b001100111100101100101, + 0b001001111001101001100, + 0b001001001001001001000, + 0b001001011001001001000, +}; + +static const uint32_t gfx12_control_index_table[32] = { + 0b000000000000000000100, /* (16|M0) */ + 0b000000000000000000011, /* (8|M0) */ + 0b000000010000000000000, /* (W) (1|M0) */ + 0b000000010000000000100, /* (W) (16|M0) */ + 0b000000010000000000011, /* (W) (8|M0) */ + 0b010000000000000000100, /* (16|M0) (ge)f0.0 */ + 0b000000000000000100100, /* (16|M16) */ + 0b010100000000000000100, /* (16|M0) (lt)f0.0 */ + 
0b000000000000000000000, /* (1|M0) */ + 0b000010000000000000100, /* (16|M0) (sat) */ + 0b000000000000000010011, /* (8|M8) */ + 0b001100000000000000100, /* (16|M0) (gt)f0.0 */ + 0b000100000000000000100, /* (16|M0) (eq)f0.0 */ + 0b000100010000000000100, /* (W) (16|M0) (eq)f0.0 */ + 0b001000000000000000100, /* (16|M0) (ne)f0.0 */ + 0b000000000000100000100, /* (f0.0) (16|M0) */ + 0b010100000000000000011, /* (8|M0) (lt)f0.0 */ + 0b000000000000110000100, /* (f1.0) (16|M0) */ + 0b000000010000000000001, /* (W) (2|M0) */ + 0b000000000000101000100, /* (f0.1) (16|M0) */ + 0b000000000000111000100, /* (f1.1) (16|M0) */ + 0b010000010000000000100, /* (W) (16|M0) (ge)f0.0 */ + 0b000000000000000100011, /* (8|M16) */ + 0b000000000000000110011, /* (8|M24) */ + 0b010100010000000000100, /* (W) (16|M0) (lt)f0.0 */ + 0b010000000000000000011, /* (8|M0) (ge)f0.0 */ + 0b000100010000000000000, /* (W) (1|M0) (eq)f0.0 */ + 0b000010000000000000011, /* (8|M0) (sat) */ + 0b010100000000010000100, /* (16|M0) (lt)f1.0 */ + 0b000100000000000000011, /* (8|M0) (eq)f0.0 */ + 0b000001000000000000011, /* (8|M0) {AccWrEn} */ + 0b000000010000000100100, /* (W) (16|M16) */ +}; + +static const uint32_t gfx12_datatype_table[32] = { + 0b11010110100101010100, /* grf<1>:f grf:f grf:f */ + 0b00000110100101010100, /* grf<1>:f grf:f arf:ub */ + 0b00000010101101010100, /* grf<1>:f imm:f arf:ub */ + 0b01010110110101010100, /* grf<1>:f grf:f imm:f */ + 0b11010100100101010100, /* arf<1>:f grf:f grf:f */ + 0b11010010100101010100, /* grf<1>:f arf:f grf:f */ + 0b01010100110101010100, /* arf<1>:f grf:f imm:f */ + 0b00000000100000000000, /* arf<1>:ub arf:ub arf:ub */ + 0b11010000100101010100, /* arf<1>:f arf:f grf:f */ + 0b00101110110011001100, /* grf<1>:d grf:d imm:w */ + 0b10110110100011001100, /* grf<1>:d grf:d grf:d */ + 0b01010010110101010100, /* grf<1>:f arf:f imm:f */ + 0b10010110100001000100, /* grf<1>:ud grf:ud grf:ud */ + 0b01010000110101010100, /* arf<1>:f arf:f imm:f */ + 0b00110110110011001100, /* grf<1>:d grf:d 
imm:d */ + 0b00010110110001000100, /* grf<1>:ud grf:ud imm:ud */ + 0b00000111000101010100, /* grf<2>:f grf:f arf:ub */ + 0b00101100110011001100, /* arf<1>:d grf:d imm:w */ + 0b00000000100000100010, /* arf<1>:uw arf:uw arf:ub */ + 0b00000010100001000100, /* grf<1>:ud arf:ud arf:ub */ + 0b00100110110000101010, /* grf<1>:w grf:uw imm:uv */ + 0b00001110110000100010, /* grf<1>:uw grf:uw imm:uw */ + 0b10010111000001000100, /* grf<2>:ud grf:ud grf:ud */ + 0b00000110100101001100, /* grf<1>:d grf:f arf:ub */ + 0b10001100100011001100, /* arf<1>:d grf:d grf:uw */ + 0b00000110100001010100, /* grf<1>:f grf:ud arf:ub */ + 0b00101110110001001100, /* grf<1>:d grf:ud imm:w */ + 0b00000010100000100010, /* grf<1>:uw arf:uw arf:ub */ + 0b00000110100000110100, /* grf<1>:f grf:uw arf:ub */ + 0b00000110100000010100, /* grf<1>:f grf:ub arf:ub */ + 0b00000110100011010100, /* grf<1>:f grf:d arf:ub */ + 0b00000010100101010100, /* grf<1>:f arf:f arf:ub */ +}; + +static const uint16_t gfx12_subreg_table[32] = { + 0b000000000000000, /* .0 .0 .0 */ + 0b100000000000000, /* .0 .0 .16 */ + 0b001000000000000, /* .0 .0 .4 */ + 0b011000000000000, /* .0 .0 .12 */ + 0b000000010000000, /* .0 .4 .0 */ + 0b010000000000000, /* .0 .0 .8 */ + 0b101000000000000, /* .0 .0 .20 */ + 0b000000000001000, /* .8 .0 .0 */ + 0b000000100000000, /* .0 .8 .0 */ + 0b110000000000000, /* .0 .0 .24 */ + 0b111000000000000, /* .0 .0 .28 */ + 0b000001000000000, /* .0 .16 .0 */ + 0b000000000000100, /* .4 .0 .0 */ + 0b000001100000000, /* .0 .24 .0 */ + 0b000001010000000, /* .0 .20 .0 */ + 0b000000110000000, /* .0 .12 .0 */ + 0b000001110000000, /* .0 .28 .0 */ + 0b000000000011100, /* .28 .0 .0 */ + 0b000000000010000, /* .16 .0 .0 */ + 0b000000000001100, /* .12 .0 .0 */ + 0b000000000011000, /* .24 .0 .0 */ + 0b000000000010100, /* .20 .0 .0 */ + 0b000000000000010, /* .2 .0 .0 */ + 0b000000101000000, /* .0 .10 .0 */ + 0b000000001000000, /* .0 .2 .0 */ + 0b000000010000100, /* .4 .4 .0 */ + 0b000000001011100, /* .28 .2 .0 */ + 
0b000000001000010, /* .2 .2 .0 */ + 0b000000110001100, /* .12 .12 .0 */ + 0b000000000100000, /* .0 .1 .0 */ + 0b000000001100000, /* .0 .3 .0 */ + 0b110001100000000, /* .0 .24 .24 */ +}; + +static const uint16_t gfx12_src0_index_table[16] = { + 0b010001100100, /* r<8;8,1> */ + 0b000000000000, /* r<0;1,0> */ + 0b010001100110, /* -r<8;8,1> */ + 0b010001100101, /* (abs)r<8;8,1> */ + 0b000000000010, /* -r<0;1,0> */ + 0b001000000000, /* r<2;1,0> */ + 0b001001000000, /* r<2;4,0> */ + 0b001101000000, /* r<4;4,0> */ + 0b001000100100, /* r<2;2,1> */ + 0b001100000000, /* r<4;1,0> */ + 0b001000100110, /* -r<2;2,1> */ + 0b001101000100, /* r<4;4,1> */ + 0b010001100111, /* -(abs)r<8;8,1> */ + 0b000100000000, /* r<1;1,0> */ + 0b000000000001, /* (abs)r<0;1,0> */ + 0b111100010000, /* r[a]<1,0> */ +}; + +static const uint16_t gfx12_src1_index_table[16] = { + 0b000100011001, /* r<8;8,1> */ + 0b000000000000, /* r<0;1,0> */ + 0b100100011001, /* -r<8;8,1> */ + 0b100000000000, /* -r<0;1,0> */ + 0b010100011001, /* (abs)r<8;8,1> */ + 0b100011010000, /* -r<4;4,0> */ + 0b000010000000, /* r<2;1,0> */ + 0b000010001001, /* r<2;2,1> */ + 0b100010001001, /* -r<2;2,1> */ + 0b000011010000, /* r<4;4,0> */ + 0b000011010001, /* r<4;4,1> */ + 0b000011000000, /* r<4;1,0> */ + 0b110100011001, /* -(abs)r<8;8,1> */ + 0b010000000000, /* (abs)r<0;1,0> */ + 0b110000000000, /* -(abs)r<0;1,0> */ + 0b100011010001, /* -r<4;4,1> */ +}; + +static const uint16_t xehp_src0_index_table[16] = { + 0b000100000000, /* r<1;1,0> */ + 0b000000000000, /* r<0;1,0> */ + 0b000100000010, /* -r<1;1,0> */ + 0b000100000001, /* (abs)r<1;1,0> */ + 0b000000000010, /* -r<0;1,0> */ + 0b001000000000, /* r<2;1,0> */ + 0b001001000000, /* r<2;4,0> */ + 0b001101000000, /* r<4;4,0> */ + 0b001100000000, /* r<4;1,0> */ + 0b000100000011, /* -(abs)r<1;1,0> */ + 0b000000000001, /* (abs)r<0;1,0> */ + 0b111100010000, /* r[a]<1,0> */ + 0b010001100000, /* r<8;8,0> */ + 0b000101000000, /* r<1;4,0> */ + 0b010001001000, /* r<8;4,2> */ + 0b001000000010, /* 
-r<2;1,0> */ +}; + +static const uint16_t xehp_src1_index_table[16] = { + 0b000001000000, /* r<1;1,0> */ + 0b000000000000, /* r<0;1,0> */ + 0b100001000000, /* -r<1;1,0> */ + 0b100000000000, /* -r<0;1,0> */ + 0b010001000000, /* (abs)r<1;1,0> */ + 0b100011010000, /* -r<4;4,0> */ + 0b000010000000, /* r<2;1,0> */ + 0b000011010000, /* r<4;4,0> */ + 0b000011000000, /* r<4;1,0> */ + 0b110001000000, /* -(abs)r<1;1,0> */ + 0b010000000000, /* (abs)r<0;1,0> */ + 0b110000000000, /* -(abs)r<0;1,0> */ + 0b000100011000, /* r<8;8,0> */ + 0b100010000000, /* -r<2;1,0> */ + 0b100000001001, /* -r<0;2,1> */ + 0b100001000100, /* -r[a]<1;1,0> */ +}; + +static const uint32_t xe2_control_index_table[32] = { + 0b000000000000000100, /* (16|M0) */ + 0b000000100000000000, /* (W) (1|M0) */ + 0b000000000010000100, /* (16|M16) */ + 0b000000000000000000, /* (1|M0) */ + 0b000000100000000100, /* (W) (16|M0) */ + 0b010000000000000100, /* (16|M0) (.ge)f0.0 */ + 0b010100000000000100, /* (16|M0) (.lt)f0.0 */ + 0b000000100000000010, /* (W) (4|M0) */ + 0b000000000000000101, /* (32|M0) */ + 0b000000100000000011, /* (W) (8|M0) */ + 0b001100100000000000, /* (W) (1|M0) (.gt)f0.0 */ + 0b000010000000000100, /* (16|M0) (sat) */ + 0b000100000000000100, /* (16|M0) (.eq)f0.0 */ + 0b000000100000000001, /* (W) (2|M0) */ + 0b001100000000000100, /* (16|M0) (.gt)f0.0 */ + 0b000100100000000000, /* (W) (1|M0) (.eq)f0.0 */ + 0b010100100000000010, /* (W) (4|M0) (.lt)f0.0 */ + 0b010000100000000000, /* (W) (1|M0) (.ge)f0.0 */ + 0b010000100000000010, /* (W) (4|M0) (.ge)f0.0 */ + 0b010100100000000000, /* (W) (1|M0) (.lt)f0.0 */ + 0b001000000000000100, /* (16|M0) (.ne)f0.0 */ + 0b000000000100100100, /* (f2.0) (16|M0) */ + 0b010100100000000011, /* (W) (8|M0) (.lt)f0.0 */ + 0b000000000100011100, /* (f1.1) (16|M0) */ + 0b010000100000000011, /* (W) (8|M0) (.ge)f0.0 */ + 0b000000000100001100, /* (f0.1) (16|M0) */ + 0b000000000100010100, /* (f1.0) (16|M0) */ + 0b000000000100110100, /* (f3.0) (16|M0) */ + 0b000000000100111100, /* 
(f3.1) (16|M0) */ + 0b000000000100101100, /* (f2.1) (16|M0) */ + 0b000000000100000100, /* (f0.0) (16|M0) */ + 0b010100000000100100, /* (16|M0) (.lt)f2.0 */ +}; + +static const uint32_t xe2_datatype_table[32] = { + 0b11010110100101010100, /* grf<1>:f grf:f grf:f */ + 0b11010100100101010100, /* arf<1>:f grf:f grf:f */ + 0b00000110100101010100, /* grf<1>:f grf:f arf:ub */ + 0b00000110100001000100, /* grf<1>:ud grf:ud arf:ub */ + 0b01010110110101010100, /* grf<1>:f grf:f imm:f */ + 0b11010010100101010100, /* grf<1>:f arf:f grf:f */ + 0b10111110100011101110, /* grf<1>:q grf:q grf:q */ + 0b00000000100000000000, /* arf<1>:ub arf:ub arf:ub */ + 0b01010110100101010100, /* grf<1>:f grf:f arf:f */ + 0b00000010101001000100, /* grf<1>:ud imm:ud */ + 0b00101110110011001100, /* grf<1>:d grf:d imm:w */ + 0b11010000100101010100, /* arf<1>:f arf:f grf:f */ + 0b01010100100101010100, /* arf<1>:f grf:f arf:f */ + 0b01010100110101010100, /* arf<1>:f grf:f imm:f */ + 0b00000010101101010100, /* grf<1>:f imm:f */ + 0b00000110100011001100, /* grf<1>:d grf:d arf:ub */ + 0b00101110110011101110, /* grf<1>:q grf:q imm:w */ + 0b00000110100001100110, /* grf<1>:uq grf:uq arf:ub */ + 0b01010000100101010100, /* arf<1>:f arf:f arf:f */ + 0b10110110100011001100, /* grf<1>:d grf:d grf:d */ + 0b01010010100101010100, /* grf<1>:f arf:f arf:f */ + 0b00000111000001000100, /* grf<2>:ud grf:ud arf:ub */ + 0b00110110110011001110, /* grf<1>:q grf:d imm:d */ + 0b00101100110011001100, /* arf<1>:d grf:d imm:w */ + 0b11011110100101110110, /* grf<1>:df grf:df grf:df */ + 0b01010010110101010100, /* grf<1>:f arf:f imm:f */ + 0b10010110100001000100, /* grf<1>:ud grf:ud grf:ud */ + 0b00000010100001000100, /* grf<1>:ud arf:ud arf:ub */ + 0b00001110110001000100, /* grf<1>:ud grf:ud imm:uw */ + 0b00000010101010101100, /* grf<1>:d imm:w */ + 0b01010000110101010100, /* arf<1>:f arf:f imm:f */ + 0b00000100100001000100, /* arf<1>:ud grf:ud arf:ub */ +}; + +static const uint16_t xe2_subreg_table[16] = { + 0b000000000000, /* .0 
.0 */ + 0b000010000000, /* .0 .4 */ + 0b000000000100, /* .4 .0 */ + 0b010000000000, /* .0 .32 */ + 0b001000000000, /* .0 .16 */ + 0b000000001000, /* .8 .0 */ + 0b000100000000, /* .0 .8 */ + 0b010100000000, /* .0 .40 */ + 0b011000000000, /* .0 .48 */ + 0b000110000000, /* .0 .12 */ + 0b000000010000, /* .16 .0 */ + 0b011010000000, /* .0 .52 */ + 0b001100000000, /* .0 .24 */ + 0b011100000000, /* .0 .56 */ + 0b010110000000, /* .0 .44 */ + 0b010010000000, /* .0 .36 */ +}; + +static const uint16_t xe2_src0_index_table[8] = { + 0b00100000000, /* r<1;1,0> */ + 0b00000000000, /* r<0;1,0> */ + 0b01000000000, /* r<2;1,0> */ + 0b00100000010, /* -r<1;1,0> */ + 0b01100000000, /* r<4;1,0> */ + 0b00100000001, /* (abs)r<1;1,0> */ + 0b00000000010, /* -r<0;1,0> */ + 0b01001000000, /* r<2;4,0> */ +}; + +static const uint16_t xe2_src1_index_table[16] = { + 0b0000100000000000, /* r<1;1,0>.0 */ + 0b0000000000000000, /* r<0;1,0>.0 */ + 0b1000100000000000, /* -r<1;1,0>.0 */ + 0b0000000000010000, /* r<0;1,0>.8 */ + 0b0000000000001000, /* r<0;1,0>.4 */ + 0b0000000000011000, /* r<0;1,0>.12 */ + 0b0000000001010000, /* r<0;1,0>.40 */ + 0b0000000001000000, /* r<0;1,0>.32 */ + 0b0000000000100000, /* r<0;1,0>.16 */ + 0b0000000001111000, /* r<0;1,0>.60 */ + 0b0000000000111000, /* r<0;1,0>.28 */ + 0b0000000000101000, /* r<0;1,0>.20 */ + 0b0000000001011000, /* r<0;1,0>.44 */ + 0b0000000001001000, /* r<0;1,0>.36 */ + 0b0000000001110000, /* r<0;1,0>.56 */ + 0b0000000000110000, /* r<0;1,0>.24 */ +}; + +/* This is actually the control index table for Cherryview (26 bits), but the + * only difference from Broadwell (24 bits) is that it has two extra 0-bits at + * the start. + * + * The low 24 bits have the same mappings on both hardware. 
+ */ +static const uint32_t gfx8_3src_control_index_table[4] = { + 0b00100000000110000000000001, + 0b00000000000110000000000001, + 0b00000000001000000000000001, + 0b00000000001000000000100001, +}; + +/* This is actually the control index table for Cherryview (49 bits), but the + * only difference from Broadwell (46 bits) is that it has three extra 0-bits + * at the start. + * + * The low 44 bits have the same mappings on both hardware, and since the high + * three bits on Broadwell are zero, we can reuse Cherryview's table. + */ +static const uint64_t gfx8_3src_source_index_table[4] = { + 0b0000001110010011100100111001000001111000000000000, + 0b0000001110010011100100111001000001111000000000010, + 0b0000001110010011100100111001000001111000000001000, + 0b0000001110010011100100111001000001111000000100000, +}; + +static const uint64_t gfx12_3src_control_index_table[32] = { + 0b000001001010010101000000000000000100, /* (16|M0) grf<1>:f :f :f :f */ + 0b000001001010010101000000000000000011, /* (8|M0) grf<1>:f :f :f :f */ + 0b000001001000010101000000000000000011, /* (8|M0) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000000011, /* (W) (8|M0) grf<1>:f :f :f :f */ + 0b000001001000010101000010000000000011, /* (W) (8|M0) arf<1>:f :f :f :f */ + 0b000001001000010101000000000000010011, /* (8|M8) arf<1>:f :f :f :f */ + 0b000001001010010101000000000000010011, /* (8|M8) grf<1>:f :f :f :f */ + 0b000001001000010101000010000000010011, /* (W) (8|M8) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000010011, /* (W) (8|M8) grf<1>:f :f :f :f */ + 0b000001001010010101000010000000000100, /* (W) (16|M0) grf<1>:f :f :f :f */ + 0b000001001000010101000000000000000100, /* (16|M0) arf<1>:f :f :f :f */ + 0b000001001010010101010000000000000100, /* (16|M0) (sat)grf<1>:f :f :f :f */ + 0b000001001010010101000000000000100100, /* (16|M16) grf<1>:f :f :f :f */ + 0b000001001000010101000010000000000100, /* (W) (16|M0) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000000000, /* (W) (1|M0) 
grf<1>:f :f :f :f */ + 0b000001001010010101010000000000000011, /* (8|M0) (sat)grf<1>:f :f :f :f */ + 0b000001001000010101000010000000110011, /* (W) (8|M24) arf<1>:f :f :f :f */ + 0b000001001000010101000010000000100011, /* (W) (8|M16) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000110011, /* (W) (8|M24) grf<1>:f :f :f :f */ + 0b000001001010010101000010000000100011, /* (W) (8|M16) grf<1>:f :f :f :f */ + 0b000001001000010101000000000000100011, /* (8|M16) arf<1>:f :f :f :f */ + 0b000001001000010101000000000000110011, /* (8|M24) arf<1>:f :f :f :f */ + 0b000001001010010101000000000000100011, /* (8|M16) grf<1>:f :f :f :f */ + 0b000001001010010101000000000000110011, /* (8|M24) grf<1>:f :f :f :f */ + 0b000001001000010101010000000000000100, /* (16|M0) (sat)arf<1>:f :f :f :f */ + 0b000001001010010101010010000000000100, /* (W) (16|M0) (sat)grf<1>:f :f :f :f */ + 0b000001001010010101000010000000100100, /* (W) (16|M16) grf<1>:f :f :f :f */ + 0b000001001010010001000010000000000000, /* (W) (1|M0) grf<1>:ud :ud :ud :ud */ + 0b000001001000010101000000000000100100, /* (16|M16) arf<1>:f :f :f :f */ + 0b000001001010010101010000000000100100, /* (16|M16) (sat)grf<1>:f :f :f :f */ + 0b000001001010010101000010000000000010, /* (W) (4|M0) grf<1>:f :f :f :f */ + 0b000001001000010101010000000000000011, /* (8|M0) (sat)arf<1>:f :f :f :f */ +}; + +static const uint64_t xehp_3src_control_index_table[32] = { + 0b0000010010100010101000000000000000100, /* (16|M0) grf<1>:f :f :f :f */ + 0b0000010010100010101000000000000000011, /* (8|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000000011, /* (8|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000000011, /* (W) (8|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000000011, /* (W) (8|M0) arf<1>:f :f :f :f */ + 0b0000010010000010101000000000000010011, /* (8|M8) arf<1>:f :f :f :f */ + 0b0000010010100010101000000000000010011, /* (8|M8) grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000010011, /* (W) (8|M8) arf<1>:f 
:f :f :f */ + 0b0000010010100010101000010000000010011, /* (W) (8|M8) grf<1>:f :f :f :f */ + 0b0000010010100010101000010000000000100, /* (W) (16|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000000100, /* (16|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101010000000000000100, /* (16|M0) (sat)grf<1>:f :f :f :f */ + 0b0000010010100010101000000000000100100, /* (16|M16) grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000000100, /* (W) (16|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000000000, /* (W) (1|M0) grf<1>:f :f :f :f */ + 0b0000010010100010101010000000000000011, /* (8|M0) (sat)grf<1>:f :f :f :f */ + 0b0000010010000010101000010000000100011, /* (W) (8|M16) arf<1>:f :f :f :f */ + 0b0000010010000010101000010000000110011, /* (W) (8|M24) arf<1>:f :f :f :f */ + 0b0000010010100010101000010000000100011, /* (W) (8|M16) grf<1>:f :f :f :f */ + 0b0000010010100010101000010000000110011, /* (W) (8|M24) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000110011, /* (8|M24) arf<1>:f :f :f :f */ + 0b0000010010000010101000000000000100011, /* (8|M16) arf<1>:f :f :f :f */ + 0b0000000100111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :b */ + 0b0000000000111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :ub {Atomic} */ + 0b0000100100111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :b {Atomic} */ + 0b0000100000111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :ub {Atomic} */ + 0b0000100100111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :b */ + 0b0000000000111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :ub */ + 0b0000000100111110011000100000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :ub :b {Atomic} */ + 0b0000100000111110011000000000000000011, /* dpas.8x* (8|M0) grf<1>:d :d :b :ub */ + 0b0000101101111010101000100000000000011, /* dpas.8x* (8|M0) grf<1>:f :f :bf :bf {Atomic} */ + 0b0000101101111010101000000000000000011, /* dpas.8x* (8|M0) grf<1>:f :f :bf :bf */ +}; + 
+static const uint64_t xe2_3src_control_index_table[16] = { + 0b0000010010100010101000000000000100, /* (16|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000100, /* (16|M0) arf<1>:f :f :f :f */ + 0b0000010010100010101000100000000100, /* (W)(16|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000100000000100, /* (W)(16|M0) arf<1>:f :f :f :f */ + 0b0000011011100011101100000000000100, /* (16|M0) grf<1>:df :df :df :df */ + 0b0000011011100011101100000010000100, /* (16|M16) grf<1>:df :df :df :df */ + 0b0000011011000011101100000000000100, /* (16|M0) arf<1>:df :df :df :df */ + 0b0000010010100010101000000000000101, /* (32|M0) grf<1>:f :f :f :f */ + 0b0000010010000010101000000000000101, /* (32|M0) arf<1>:f :f :f :f */ + 0b0000010010000010101010000000000100, /* (16|M0) (sat)arf<1>:f :f :f :f */ + 0b0000010010100010101010000000000100, /* (16|M0) (sat)grf<1>:f :f :f :f */ + 0b0000011011000011101100000010000100, /* (16|M16) arf<1>:df :df :df :df */ + 0b0000010010100010101000100000000000, /* (W)(1|M0) grf<1>:f :f :f :f */ + 0b0000010010100010001000000000000100, /* (16|M0) grf<1>:ud :ud :ud :ud */ + 0b0000110110100110011000000000000101, /* (32|M0) grf<1>:d :d :d :d */ + 0b0000011011000011101100000000000011, /* (8|M0) arf<1>:df :df :df :df */ +}; + +static const uint64_t xe2_3src_dpas_control_index_table[16] = { + 0b0000000000111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :ub Atomic */ + 0b0000000100111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :b Atomic */ + 0b0000100000111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :b :ub Atomic */ + 0b0000100100111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :b :b Atomic */ + 0b0000000000111110011000000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :ub */ + 0b0000100100111110011000000000000100, /* dpas.8x* (16|M0) grf:d :d :b :b */ + 0b0000101101111010101001000000000100, /* dpas.8x* (16|M0) grf:f :f :bf :bf Atomic */ + 0b0000101101111101101001000000000100, /* dpas.8x* (16|M0) grf:f :bf :bf :bf Atomic 
*/ + 0b0000101101111010110101000000000100, /* dpas.8x* (16|M0) grf:bf :f :bf :bf Atomic */ + 0b0000101101111101110101000000000100, /* dpas.8x* (16|M0) grf:bf :bf :bf :bf Atomic */ + 0b0000101101111010101000000000000100, /* dpas.8x* (16|M0) grf:f :f :bf :bf */ + 0b0000001001111010101001000000000100, /* dpas.8x* (16|M0) grf:f :f :hf :hf Atomic */ + 0b0000001001111001101001000000000100, /* dpas.8x* (16|M0) grf:f :hf :hf :hf Atomic */ + 0b0000001001111010100101000000000100, /* dpas.8x* (16|M0) grf:hf :f :hf :hf Atomic */ + 0b0000001001111001100101000000000100, /* dpas.8x* (16|M0) grf:hf :hf :hf :hf Atomic */ + 0b0000001001111010101000000000000100, /* dpas.8x* (16|M0) grf:f :f :hf :hf */ +}; + +static const uint32_t gfx12_3src_source_index_table[32] = { + 0b100101100001100000000, /* grf<0;0> grf<8;1> grf<0> */ + 0b100101100001001000010, /* arf<4;1> grf<8;1> grf<0> */ + 0b101101100001101000011, /* grf<8;1> grf<8;1> grf<1> */ + 0b100101100001101000011, /* grf<8;1> grf<8;1> grf<0> */ + 0b101100000000101000011, /* grf<8;1> grf<0;0> grf<1> */ + 0b101101100001101001011, /* -grf<8;1> grf<8;1> grf<1> */ + 0b101001100001101000011, /* grf<8;1> arf<8;1> grf<1> */ + 0b100001100001100000000, /* grf<0;0> arf<8;1> grf<0> */ + 0b101101100001100000000, /* grf<0;0> grf<8;1> grf<1> */ + 0b101101100101101000011, /* grf<8;1> grf<8;1> -grf<1> */ + 0b101101110001101000011, /* grf<8;1> -grf<8;1> grf<1> */ + 0b101100000000100000000, /* grf<0;0> grf<0;0> grf<1> */ + 0b100001100001101000011, /* grf<8;1> arf<8;1> grf<0> */ + 0b100101110001100000000, /* grf<0;0> -grf<8;1> grf<0> */ + 0b100101110001101000011, /* grf<8;1> -grf<8;1> grf<0> */ + 0b100101100001101001011, /* -grf<8;1> grf<8;1> grf<0> */ + 0b100100000000101000011, /* grf<8;1> grf<0;0> grf<0> */ + 0b100101100001100001000, /* -grf<0;0> grf<8;1> grf<0> */ + 0b100100000000100000000, /* grf<0;0> grf<0;0> grf<0> */ + 0b101101110001100000000, /* grf<0;0> -grf<8;1> grf<1> */ + 0b100101100101100000000, /* grf<0;0> grf<8;1> -grf<0> */ + 
0b101001100001100000000, /* grf<0;0> arf<8;1> grf<1> */ + 0b100101100101101000011, /* grf<8;1> grf<8;1> -grf<0> */ + 0b101101100101101001011, /* -grf<8;1> grf<8;1> -grf<1> */ + 0b101001100001101001011, /* -grf<8;1> arf<8;1> grf<1> */ + 0b101101110001101001011, /* -grf<8;1> -grf<8;1> grf<1> */ + 0b101100010000101000011, /* grf<8;1> -grf<0;0> grf<1> */ + 0b101100000100101000011, /* grf<8;1> grf<0;0> -grf<1> */ + 0b101101100001100001000, /* -grf<0;0> grf<8;1> grf<1> */ + 0b101101100101100000000, /* grf<0;0> grf<8;1> -grf<1> */ + 0b100100000100101000011, /* grf<8;1> grf<0;0> -grf<0> */ + 0b101001100101101000011, /* grf<8;1> arf<8;1> -grf<1> */ +}; + +static const uint32_t xehp_3src_source_index_table[32] = { + 0b100100000001100000000, /* grf<0;0> grf<1;0> grf<0> */ + 0b100100000001000000001, /* arf<1;0> grf<1;0> grf<0> */ + 0b101100000001100000001, /* grf<1;0> grf<1;0> grf<1> */ + 0b100100000001100000001, /* grf<1;0> grf<1;0> grf<0> */ + 0b101100000000100000001, /* grf<1;0> grf<0;0> grf<1> */ + 0b101100000001100001001, /* -grf<1;0> grf<1;0> grf<1> */ + 0b101000000001100000001, /* grf<1;0> arf<1;0> grf<1> */ + 0b101100000001100000000, /* grf<0;0> grf<1;0> grf<1> */ + 0b100000000001100000000, /* grf<0;0> arf<1;0> grf<0> */ + 0b101100000101100000001, /* grf<1;0> grf<1;0> -grf<1> */ + 0b101100010001100000001, /* grf<1;0> -grf<1;0> grf<1> */ + 0b101100000000100000000, /* grf<0;0> grf<0;0> grf<1> */ + 0b100000000001100000001, /* grf<1;0> arf<1;0> grf<0> */ + 0b100100010001100000000, /* grf<0;0> -grf<1;0> grf<0> */ + 0b100100010001100000001, /* grf<1;0> -grf<1;0> grf<0> */ + 0b100100000001100001001, /* -grf<1;0> grf<1;0> grf<0> */ + 0b100100000000100000001, /* grf<1;0> grf<0;0> grf<0> */ + 0b100100000001100001000, /* -grf<0;0> grf<1;0> grf<0> */ + 0b100100000000100000000, /* grf<0;0> grf<0;0> grf<0> + * dpas.*x1 grf:d grf:[ub,b] grf:[ub,b] + * dpas.*x1 grf:f grf:bf grf:bf + */ + 0b101100010001100000000, /* grf<0;0> -grf<1;0> grf<1> */ + 0b100100000101100000000, /* grf<0;0> 
grf<1;0> -grf<0> */ + 0b101000000001100000000, /* grf<0;0> arf<1;0> grf<1> */ + 0b100100000101100000001, /* grf<1;0> grf<1;0> -grf<0> */ + 0b101100000101100001001, /* -grf<1;0> grf<1;0> -grf<1> */ + 0b100100010000100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[ub,b] */ + 0b100100000100100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u2,s2] */ + 0b100100010100100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u2,s2] */ + 0b100100001000100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[ub,b] */ + 0b100100001100100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u2,s2] */ + 0b100100000010100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u4,s4] */ + 0b100100001010100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u4,s4] */ + 0b100100010010100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u4,s4] */ +}; + +static const uint32_t xe2_3src_source_index_table[16] = { + 0b101100000001100000001, /* grf<1;0> grf<1;0> grf<1> */ + 0b101100000001000000001, /* arf<1;0> grf<1;0> grf<1> */ + 0b100100000001100000000, /* grf<0;0> grf<1;0> grf<0> */ + 0b100100000001000000001, /* arf<1;0> grf<1;0> grf<0> */ + 0b100100000001100000001, /* grf<1;0> grf<1;0> grf<0> */ + 0b100000000001100000000, /* grf<0;0> arf<1;0> grf<0> */ + 0b100000000001100000001, /* grf<1;0> arf<1;0> grf<0> */ + 0b101100000101100000001, /* grf<1;0> grf<1;0> -grf<1> */ + 0b101000000001100000001, /* grf<1;0> arf<1;0> grf<1> */ + 0b101000000001000000001, /* arf<1;0> arf<1;0> grf<1> */ + 0b100000000001000000001, /* arf<1;0> arf<1;0> grf<0> */ + 0b100100000000100000000, /* grf<0;0> grf<0;0> grf<0> */ + 0b100100000000100000001, /* grf<1;0> grf<0;0> grf<0> */ + 0b101100000101000000001, /* arf<1;0> grf<1;0> -grf<1> */ + 0b100100010001100000001, /* grf<1;0> -grf<1;0> grf<0> */ + 0b100100010001000000001, /* arf<1;0> -grf<1;0> grf<0> */ +}; + +static const uint32_t xe2_3src_dpas_source_index_table[16] = { + 0b100100000000100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[ub,b] + * dpas.*x1 grf:[f,bf] grf:bf grf:bf + * dpas.*x1 grf:[f,hf] grf:hf grf:hf + */ + 
0b100100000010100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u4,s4] */ + 0b100100000100100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u2,s2] */ + 0b100100001000100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[ub,b] */ + 0b100100001010100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u4,s4] */ + 0b100100001100100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u2,s2] */ + 0b100100010000100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[ub,b] */ + 0b100100010010100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u4,s4] */ + 0b100100010100100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u2,s2] */ + 0b100100000000100000010, /* dpas.*x2 grf:d grf:[ub,b] grf:[ub,b] */ + 0b100100000010100000010, /* dpas.*x2 grf:d grf:[ub,b] grf:[u4,s4] */ + 0b100100001000100000010, /* dpas.*x2 grf:d grf:[u4,s4] grf:[ub,b] */ + 0b100100001010100000010, /* dpas.*x2 grf:d grf:[u4,s4] grf:[u4,s4] */ + 0b100100010100100000010, /* dpas.*x2 grf:d grf:[u2,s2] grf:[u2,s2] */ + 0b100100000000100001110, /* dpas.*x8 grf:d grf:[ub,b] grf:[ub,b] */ + 0b100100001010100001110, /* dpas.*x8 grf:d grf:[u4,s4] grf:[u4,s4] */ +}; + +static const uint32_t gfx12_3src_subreg_table[32] = { + 0b00000000000000000000, /* .0 .0 .0 .0 */ + 0b00100000000000000000, /* .0 .0 .0 .4 */ + 0b00000000000110000000, /* .0 .12 .0 .0 */ + 0b10100000000000000000, /* .0 .0 .0 .20 */ + 0b10000000001110000000, /* .0 .28 .0 .16 */ + 0b01100000000000000000, /* .0 .0 .0 .12 */ + 0b01000000000000000000, /* .0 .0 .0 .8 */ + 0b00000010000000000000, /* .0 .0 .8 .0 */ + 0b00000001000000000000, /* .0 .0 .4 .0 */ + 0b11000000000000000000, /* .0 .0 .0 .24 */ + 0b10000000000000000000, /* .0 .0 .0 .16 */ + 0b11100000000000000000, /* .0 .0 .0 .28 */ + 0b00000110000000000000, /* .0 .0 .24 .0 */ + 0b00000000000010000000, /* .0 .4 .0 .0 */ + 0b00000100000000000000, /* .0 .0 .16 .0 */ + 0b00000011000000000000, /* .0 .0 .12 .0 */ + 0b00000101000000000000, /* .0 .0 .20 .0 */ + 0b00000111000000000000, /* .0 .0 .28 .0 */ + 0b00000000000100000000, /* .0 .8 .0 .0 */ + 
0b00000000001000000000, /* .0 .16 .0 .0 */ + 0b00000000001100000000, /* .0 .24 .0 .0 */ + 0b00000000001010000000, /* .0 .20 .0 .0 */ + 0b00000000001110000000, /* .0 .28 .0 .0 */ + 0b11000000001110000000, /* .0 .28 .0 .24 */ + 0b00100000000100000000, /* .0 .8 .0 .4 */ + 0b00100000000110000000, /* .0 .12 .0 .4 */ + 0b01000000000110000000, /* .0 .12 .0 .8 */ + 0b10000000001100000000, /* .0 .24 .0 .16 */ + 0b10000000001010000000, /* .0 .20 .0 .16 */ + 0b01100000000010000000, /* .0 .4 .0 .12 */ + 0b10100000001110000000, /* .0 .28 .0 .20 */ + 0b01000000000010000000, /* .0 .4 .0 .8 */ +}; + +static const uint32_t xe2_3src_subreg_table[32] = { + 0b00000000000000000000, /* .0 .0 .0 .0 */ + 0b00100000000000000000, /* .0 .0 .0 .8 */ + 0b10000000000000000000, /* .0 .0 .0 .32 */ + 0b00010000000000000000, /* .0 .0 .0 .4 */ + 0b11100000000000000000, /* .0 .0 .0 .56 */ + 0b01010000000000000000, /* .0 .0 .0 .20 */ + 0b10110000000000000000, /* .0 .0 .0 .44 */ + 0b01000000000011000000, /* .0 .12 .0 .16 */ + 0b01100000000000000000, /* .0 .0 .0 .24 */ + 0b10100000000000000000, /* .0 .0 .0 .40 */ + 0b11000000000000000000, /* .0 .0 .0 .48 */ + 0b01000000000000000000, /* .0 .0 .0 .16 */ + 0b01110000000110000000, /* .0 .24 .0 .28 */ + 0b10100000001001000000, /* .0 .36 .0 .40 */ + 0b11010000001100000000, /* .0 .48 .0 .52 */ + 0b01110000000000000000, /* .0 .0 .0 .28 */ + 0b11110000000000000000, /* .0 .0 .0 .60 */ + 0b10010000000000000000, /* .0 .0 .0 .36 */ + 0b00110000000000000000, /* .0 .0 .0 .12 */ + 0b00100000000010000000, /* .0 .8 .0 .8 */ + 0b00010000000001000000, /* .0 .4 .0 .4 */ + 0b00110000000011000000, /* .0 .12 .0 .12 */ + 0b11010000000000000000, /* .0 .0 .0 .52 */ + 0b00000000000001000000, /* .0 .4 .0 .0 */ + 0b00000101100000000000, /* .0 .0 .44 .0 */ + 0b00000100000000000000, /* .0 .0 .32 .0 */ + 0b00000000000010000000, /* .0 .8 .0 .0 */ + 0b00000000001100000000, /* .0 .48 .0 .0 */ + 0b00000000001101000000, /* .0 .52 .0 .0 */ + 0b00000110100000000000, /* .0 .0 .52 .0 */ + 
/* Tail of a compaction lookup table whose start precedes this chunk. */
   0b00000000001000000000, /* .0 .32 .0 .0 */
   0b00000000001111000000, /* .0 .60 .0 .0 */
};

/* Per-device bundle of the compaction lookup tables selected for the target
 * hardware generation, plus the ISA description they were chosen for.
 */
struct compaction_state {
   const struct brw_isa_info *isa;
   const uint32_t *control_index_table;
   const uint32_t *datatype_table;
   const uint16_t *subreg_table;
   const uint16_t *src0_index_table;
   const uint16_t *src1_index_table;
};

static void compaction_state_init(struct compaction_state *c,
                                  const struct brw_isa_info *isa);

/* Gather the instruction's control bits into the generation-specific
 * "uncompacted" key and look it up in the 32-entry control index table.
 * Returns false when the bit pattern has no table entry, i.e. the
 * instruction cannot be compacted.
 */
static bool
set_control_index(const struct compaction_state *c,
                  brw_compact_inst *dst, const brw_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t uncompacted; /* 17b/G45; 19b/IVB+; 21b/TGL+ */

   if (devinfo->ver >= 20) {
      uncompacted = (brw_inst_bits(src, 95, 92) << 14) | /* 4b */
                    (brw_inst_bits(src, 34, 34) << 13) | /* 1b */
                    (brw_inst_bits(src, 32, 32) << 12) | /* 1b */
                    (brw_inst_bits(src, 31, 31) << 11) | /* 1b */
                    (brw_inst_bits(src, 28, 28) << 10) | /* 1b */
                    (brw_inst_bits(src, 27, 26) << 8) |  /* 2b */
                    (brw_inst_bits(src, 25, 24) << 6) |  /* 2b */
                    (brw_inst_bits(src, 23, 21) << 3) |  /* 3b */
                    (brw_inst_bits(src, 20, 18));        /* 3b */
   } else if (devinfo->ver >= 12) {
      uncompacted = (brw_inst_bits(src, 95, 92) << 17) | /* 4b */
                    (brw_inst_bits(src, 34, 34) << 16) | /* 1b */
                    (brw_inst_bits(src, 33, 33) << 15) | /* 1b */
                    (brw_inst_bits(src, 32, 32) << 14) | /* 1b */
                    (brw_inst_bits(src, 31, 31) << 13) | /* 1b */
                    (brw_inst_bits(src, 28, 28) << 12) | /* 1b */
                    (brw_inst_bits(src, 27, 24) << 8) |  /* 4b */
                    (brw_inst_bits(src, 23, 22) << 6) |  /* 2b */
                    (brw_inst_bits(src, 21, 19) << 3) |  /* 3b */
                    (brw_inst_bits(src, 18, 16));        /* 3b */
   } else if (devinfo->ver >= 8) {
      uncompacted = (brw_inst_bits(src, 33, 31) << 16) | /* 3b */
                    (brw_inst_bits(src, 23, 12) << 4) |  /* 12b */
                    (brw_inst_bits(src, 10, 9) << 2) |   /* 2b */
                    (brw_inst_bits(src, 34, 34) << 1) |  /* 1b */
                    (brw_inst_bits(src, 8, 8));          /* 1b */
   } else {
      uncompacted = (brw_inst_bits(src, 31, 31) << 16) | /* 1b */
                    (brw_inst_bits(src, 23, 8));         /* 16b */

      /* On gfx7, the flag register and subregister numbers are integrated into
       * the control index.
       */
      if (devinfo->ver == 7)
         uncompacted |= brw_inst_bits(src, 90, 89) << 17; /* 2b */
   }

   for (int i = 0; i < 32; i++) {
      if (c->control_index_table[i] == uncompacted) {
         brw_compact_inst_set_control_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Same scheme as set_control_index, but for the destination/source datatype
 * fields.  With an immediate source on Gfx12+ the Src1.RegFile bit is owned
 * by the immediate, so it is excluded from the key.
 */
static bool
set_datatype_index(const struct compaction_state *c, brw_compact_inst *dst,
                   const brw_inst *src, bool is_immediate)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t uncompacted; /* 18b/G45+; 21b/BDW+; 20b/TGL+ */

   if (devinfo->ver >= 12) {
      uncompacted = (brw_inst_bits(src, 91, 88) << 15) | /* 4b */
                    (brw_inst_bits(src, 66, 66) << 14) | /* 1b */
                    (brw_inst_bits(src, 50, 50) << 13) | /* 1b */
                    (brw_inst_bits(src, 49, 48) << 11) | /* 2b */
                    (brw_inst_bits(src, 47, 47) << 10) | /* 1b */
                    (brw_inst_bits(src, 46, 46) << 9) |  /* 1b */
                    (brw_inst_bits(src, 43, 40) << 5) |  /* 4b */
                    (brw_inst_bits(src, 39, 36) << 1) |  /* 4b */
                    (brw_inst_bits(src, 35, 35));        /* 1b */

      /* Src1.RegFile overlaps with the immediate, so ignore it if an immediate
       * is present
       */
      if (!is_immediate) {
         uncompacted |= brw_inst_bits(src, 98, 98) << 19; /* 1b */
      }
   } else if (devinfo->ver >= 8) {
      uncompacted = (brw_inst_bits(src, 63, 61) << 18) | /* 3b */
                    (brw_inst_bits(src, 94, 89) << 12) | /* 6b */
                    (brw_inst_bits(src, 46, 35));        /* 12b */
   } else {
      uncompacted = (brw_inst_bits(src, 63, 61) << 15) | /* 3b */
                    (brw_inst_bits(src, 46, 32));        /* 15b */
   }

   for (int i = 0; i < 32; i++) {
      if (c->datatype_table[i] == uncompacted) {
         brw_compact_inst_set_datatype_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Compact the dst/src0/src1 subregister numbers via the subreg table.
 * On pre-Xe2 hardware the src1 subreg field overlaps the immediate, so it
 * is skipped when an immediate is present.
 */
static bool
set_subreg_index(const struct compaction_state *c, brw_compact_inst *dst,
                 const brw_inst *src, bool is_immediate)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   const unsigned table_len = devinfo->ver >= 20 ?
      ARRAY_SIZE(xe2_subreg_table) : ARRAY_SIZE(g45_subreg_table);
   uint16_t uncompacted; /* 15b/G45+; 12b/Xe2+ */

   if (devinfo->ver >= 20) {
      uncompacted = (brw_inst_bits(src, 33, 33) << 0) |  /* 1b */
                    (brw_inst_bits(src, 55, 51) << 1) |  /* 5b */
                    (brw_inst_bits(src, 71, 67) << 6) |  /* 5b */
                    (brw_inst_bits(src, 87, 87) << 11);  /* 1b */
   } else if (devinfo->ver >= 12) {
      uncompacted = (brw_inst_bits(src, 55, 51) << 0) |  /* 5b */
                    (brw_inst_bits(src, 71, 67) << 5);   /* 5b */

      if (!is_immediate)
         uncompacted |= brw_inst_bits(src, 103, 99) << 10; /* 5b */
   } else {
      uncompacted = (brw_inst_bits(src, 52, 48) << 0) |  /* 5b */
                    (brw_inst_bits(src, 68, 64) << 5);   /* 5b */

      if (!is_immediate)
         uncompacted |= brw_inst_bits(src, 100, 96) << 10; /* 5b */
   }

   for (int i = 0; i < table_len; i++) {
      if (c->subreg_table[i] == uncompacted) {
         brw_compact_inst_set_subreg_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Compact the src0 region/addressing fields via the src0 index table. */
static bool
set_src0_index(const struct compaction_state *c, brw_compact_inst *dst,
               const brw_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint16_t uncompacted; /* 12b/G45+; 11b/Xe2+ */
   int table_len;

   if (devinfo->ver >= 12) {
      table_len = (devinfo->ver >= 20 ? ARRAY_SIZE(xe2_src0_index_table) :
                   ARRAY_SIZE(gfx12_src0_index_table));
      /* Bit 87 is not part of the key on Xe2+ (it moved into the subreg
       * index there).
       */
      uncompacted = (devinfo->ver >= 20 ? 0 :
                     brw_inst_bits(src, 87, 87) << 11) | /* 1b */
                    (brw_inst_bits(src, 86, 84) << 8) |  /* 3b */
                    (brw_inst_bits(src, 83, 81) << 5) |  /* 3b */
                    (brw_inst_bits(src, 80, 80) << 4) |  /* 1b */
                    (brw_inst_bits(src, 65, 64) << 2) |  /* 2b */
                    (brw_inst_bits(src, 45, 44));        /* 2b */
   } else {
      table_len = ARRAY_SIZE(gfx8_src_index_table);
      uncompacted = brw_inst_bits(src, 88, 77); /* 12b */
   }

   for (int i = 0; i < table_len; i++) {
      if (c->src0_index_table[i] == uncompacted) {
         brw_compact_inst_set_src0_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Compact the src1 fields.  When src1 is an immediate, the src1 index field
 * instead carries part of the already-compacted immediate value (low 4 bits
 * on Gfx12+, high 5 bits of the 13-bit value before that) and no table
 * lookup is needed.
 */
static bool
set_src1_index(const struct compaction_state *c, brw_compact_inst *dst,
               const brw_inst *src, bool is_immediate, unsigned imm)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   if (is_immediate) {
      if (devinfo->ver >= 12) {
         /* src1 index takes the low 4 bits of the 12-bit compacted value */
         brw_compact_inst_set_src1_index(devinfo, dst, imm & 0xf);
      } else {
         /* src1 index takes the high 5 bits of the 13-bit compacted value */
         brw_compact_inst_set_src1_index(devinfo, dst, imm >> 8);
      }
      return true;
   } else {
      uint16_t uncompacted; /* 12b/G45+ 16b/Xe2+ */
      int table_len;

      if (devinfo->ver >= 20) {
         table_len = ARRAY_SIZE(xe2_src1_index_table);
         uncompacted = (brw_inst_bits(src, 121, 120) << 14) | /* 2b */
                       (brw_inst_bits(src, 118, 116) << 11) | /* 3b */
                       (brw_inst_bits(src, 115, 113) << 8) |  /* 3b */
                       (brw_inst_bits(src, 112, 112) << 7) |  /* 1b */
                       (brw_inst_bits(src, 103, 99) << 2) |   /* 5b */
                       (brw_inst_bits(src, 97, 96));          /* 2b */
      } else if (devinfo->ver >= 12) {
         /* NOTE(review): uses gfx12_src0_index_table's length for the src1
          * table — harmless only if both tables have the same entry count;
          * confirm against the table definitions earlier in this file.
          */
         table_len = ARRAY_SIZE(gfx12_src0_index_table);
         uncompacted = (brw_inst_bits(src, 121, 120) << 10) | /* 2b */
                       (brw_inst_bits(src, 119, 116) << 6) |  /* 4b */
                       (brw_inst_bits(src, 115, 113) << 3) |  /* 3b */
                       (brw_inst_bits(src, 112, 112) << 2) |  /* 1b */
                       (brw_inst_bits(src, 97, 96));          /* 2b */
      } else {
         table_len = ARRAY_SIZE(gfx8_src_index_table);
         uncompacted = brw_inst_bits(src, 120, 109); /* 12b */
      }

      for (int i = 0; i < table_len; i++) {
         if (c->src1_index_table[i] == uncompacted) {
            brw_compact_inst_set_src1_index(devinfo, dst, i);
            return true;
         }
      }
   }

   return false;
}

/* Three-source variant of set_control_index.  Xe2+ shares the key layout
 * between regular 3src and DPAS instructions and only the table differs.
 */
static bool
set_3src_control_index(const struct intel_device_info *devinfo,
                       brw_compact_inst *dst, const brw_inst *src,
                       bool is_dpas)
{
   assert(devinfo->ver >= 8);

   if (devinfo->ver >= 20) {
      assert(is_dpas || !brw_inst_bits(src, 49, 49));

      const uint64_t uncompacted =                       /* 34b/Xe2+ */
         (brw_inst_bits(src, 95, 92) << 30) |  /* 4b */
         (brw_inst_bits(src, 90, 88) << 27) |  /* 3b */
         (brw_inst_bits(src, 82, 80) << 24) |  /* 3b */
         (brw_inst_bits(src, 50, 50) << 23) |  /* 1b */
         (brw_inst_bits(src, 49, 48) << 21) |  /* 2b */
         (brw_inst_bits(src, 42, 40) << 18) |  /* 3b */
         (brw_inst_bits(src, 39, 39) << 17) |  /* 1b */
         (brw_inst_bits(src, 38, 36) << 14) |  /* 3b */
         (brw_inst_bits(src, 34, 34) << 13) |  /* 1b */
         (brw_inst_bits(src, 32, 32) << 12) |  /* 1b */
         (brw_inst_bits(src, 31, 31) << 11) |  /* 1b */
         (brw_inst_bits(src, 28, 28) << 10) |  /* 1b */
         (brw_inst_bits(src, 27, 26) << 8) |   /* 2b */
         (brw_inst_bits(src, 25, 24) << 6) |   /* 2b */
         (brw_inst_bits(src, 23, 21) << 3) |   /* 3b */
         (brw_inst_bits(src, 20, 18));         /* 3b */

      /* The bits used to index the tables for 3src and 3src-dpas
       * are the same, so just need to pick the right one.
       */
      const uint64_t *table = is_dpas ? xe2_3src_dpas_control_index_table :
                                        xe2_3src_control_index_table;
      const unsigned size = is_dpas ?
         ARRAY_SIZE(xe2_3src_dpas_control_index_table) :
         ARRAY_SIZE(xe2_3src_control_index_table);
      for (unsigned i = 0; i < size; i++) {
         if (table[i] == uncompacted) {
            brw_compact_inst_set_3src_control_index(devinfo, dst, i);
            return true;
         }
      }
   } else if (devinfo->verx10 >= 125) {
      uint64_t uncompacted =                             /* 37b/XeHP+ */
         (brw_inst_bits(src, 95, 92) << 33) |  /* 4b */
         (brw_inst_bits(src, 90, 88) << 30) |  /* 3b */
         (brw_inst_bits(src, 82, 80) << 27) |  /* 3b */
         (brw_inst_bits(src, 50, 50) << 26) |  /* 1b */
         (brw_inst_bits(src, 49, 48) << 24) |  /* 2b */
         (brw_inst_bits(src, 42, 40) << 21) |  /* 3b */
         (brw_inst_bits(src, 39, 39) << 20) |  /* 1b */
         (brw_inst_bits(src, 38, 36) << 17) |  /* 3b */
         (brw_inst_bits(src, 34, 34) << 16) |  /* 1b */
         (brw_inst_bits(src, 33, 33) << 15) |  /* 1b */
         (brw_inst_bits(src, 32, 32) << 14) |  /* 1b */
         (brw_inst_bits(src, 31, 31) << 13) |  /* 1b */
         (brw_inst_bits(src, 28, 28) << 12) |  /* 1b */
         (brw_inst_bits(src, 27, 24) << 8) |   /* 4b */
         (brw_inst_bits(src, 23, 23) << 7) |   /* 1b */
         (brw_inst_bits(src, 22, 22) << 6) |   /* 1b */
         (brw_inst_bits(src, 21, 19) << 3) |   /* 3b */
         (brw_inst_bits(src, 18, 16));         /* 3b */

      for (unsigned i = 0; i < ARRAY_SIZE(xehp_3src_control_index_table); i++) {
         if (xehp_3src_control_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_control_index(devinfo, dst, i);
            return true;
         }
      }
   } else if (devinfo->ver >= 12) {
      uint64_t uncompacted =                             /* 36b/TGL+ */
         (brw_inst_bits(src, 95, 92) << 32) |  /* 4b */
         (brw_inst_bits(src, 90, 88) << 29) |  /* 3b */
         (brw_inst_bits(src, 82, 80) << 26) |  /* 3b */
         (brw_inst_bits(src, 50, 50) << 25) |  /* 1b */
         (brw_inst_bits(src, 48, 48) << 24) |  /* 1b */
         (brw_inst_bits(src, 42, 40) << 21) |  /* 3b */
         (brw_inst_bits(src, 39, 39) << 20) |  /* 1b */
         (brw_inst_bits(src, 38, 36) << 17) |  /* 3b */
         (brw_inst_bits(src, 34, 34) << 16) |  /* 1b */
         (brw_inst_bits(src, 33, 33) << 15) |  /* 1b */
         (brw_inst_bits(src, 32, 32) << 14) |  /* 1b */
         (brw_inst_bits(src, 31, 31) << 13) |  /* 1b */
         (brw_inst_bits(src, 28, 28) << 12) |  /* 1b */
         (brw_inst_bits(src, 27, 24) << 8) |   /* 4b */
         (brw_inst_bits(src, 23, 23) << 7) |   /* 1b */
         (brw_inst_bits(src, 22, 22) << 6) |   /* 1b */
         (brw_inst_bits(src, 21, 19) << 3) |   /* 3b */
         (brw_inst_bits(src, 18, 16));         /* 3b */

      for (unsigned i = 0; i < ARRAY_SIZE(gfx12_3src_control_index_table); i++) {
         if (gfx12_3src_control_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_control_index(devinfo, dst, i);
            return true;
         }
      }
   } else {
      uint32_t uncompacted =                   /* 24b/BDW; 26b/CHV/SKL+ */
         (brw_inst_bits(src, 34, 32) << 21) |  /* 3b */
         (brw_inst_bits(src, 28, 8));          /* 21b */

      if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) {
         uncompacted |=
            brw_inst_bits(src, 36, 35) << 24;  /* 2b */
      }

      for (unsigned i = 0; i < ARRAY_SIZE(gfx8_3src_control_index_table); i++) {
         if (gfx8_3src_control_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_control_index(devinfo, dst, i);
            return true;
         }
      }
   }

   return false;
}

/* Three-source variant of set_src0/src1_index: one combined source-index
 * key covering all three source operands.
 */
static bool
set_3src_source_index(const struct intel_device_info *devinfo,
                      brw_compact_inst *dst, const brw_inst *src,
                      bool is_dpas)
{
   assert(devinfo->ver >= 8);

   if (devinfo->ver >= 12) {
      uint32_t uncompacted =                    /* 21b/TGL+ */
         (brw_inst_bits(src, 114, 114) << 20) | /* 1b */
         (brw_inst_bits(src, 113, 112) << 18) | /* 2b */
         (brw_inst_bits(src, 98, 98) << 17) |   /* 1b */
         (brw_inst_bits(src, 97, 96) << 15) |   /* 2b */
         (brw_inst_bits(src, 91, 91) << 14) |   /* 1b */
         (brw_inst_bits(src, 87, 86) << 12) |   /* 2b */
         (brw_inst_bits(src, 85, 84) << 10) |   /* 2b */
         (brw_inst_bits(src, 83, 83) << 9) |    /* 1b */
         (brw_inst_bits(src, 66, 66) << 8) |    /* 1b */
         (brw_inst_bits(src, 65, 64) << 6) |    /* 2b */
         (brw_inst_bits(src, 47, 47) << 5) |    /* 1b */
         (brw_inst_bits(src, 46, 46) << 4) |    /* 1b */
         (brw_inst_bits(src, 45, 44) << 2) |    /* 2b */
         (brw_inst_bits(src, 43, 43) << 1) |    /* 1b */
         (brw_inst_bits(src, 35, 35));          /* 1b */

      /* In Xe2, the bits used to index the tables for 3src and 3src-dpas
       * are the same, so just need to pick the right one.
       */
      const uint32_t *three_src_source_index_table =
         devinfo->ver >= 20 ? (is_dpas ? xe2_3src_dpas_source_index_table :
                                         xe2_3src_source_index_table) :
         devinfo->verx10 >= 125 ? xehp_3src_source_index_table :
                                  gfx12_3src_source_index_table;
      const uint32_t three_src_source_index_table_len =
         devinfo->ver >= 20 ? (is_dpas ? ARRAY_SIZE(xe2_3src_dpas_source_index_table) :
                                         ARRAY_SIZE(xe2_3src_source_index_table)) :
         devinfo->verx10 >= 125 ? ARRAY_SIZE(xehp_3src_source_index_table) :
                                  ARRAY_SIZE(gfx12_3src_source_index_table);

      for (unsigned i = 0; i < three_src_source_index_table_len; i++) {
         if (three_src_source_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_source_index(devinfo, dst, i);
            return true;
         }
      }
   } else {
      uint64_t uncompacted =                    /* 46b/BDW; 49b/CHV/SKL+ */
         (brw_inst_bits(src, 83, 83) << 43) |   /* 1b */
         (brw_inst_bits(src, 114, 107) << 35) | /* 8b */
         (brw_inst_bits(src, 93, 86) << 27) |   /* 8b */
         (brw_inst_bits(src, 72, 65) << 19) |   /* 8b */
         (brw_inst_bits(src, 55, 37));          /* 19b */

      if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) {
         uncompacted |=
            (brw_inst_bits(src, 126, 125) << 47) | /* 2b */
            (brw_inst_bits(src, 105, 104) << 45) | /* 2b */
            (brw_inst_bits(src, 84, 84) << 44);    /* 1b */
      } else {
         uncompacted |=
            (brw_inst_bits(src, 125, 125) << 45) | /* 1b */
            (brw_inst_bits(src, 104, 104) << 44);  /* 1b */
      }

      for (unsigned i = 0; i < ARRAY_SIZE(gfx8_3src_source_index_table); i++) {
         if (gfx8_3src_source_index_table[i] == uncompacted) {
            brw_compact_inst_set_3src_source_index(devinfo, dst, i);
            return true;
         }
      }
   }

   return false;
}

/* Three-source subregister compaction; only exists on Gfx12+. */
static bool
set_3src_subreg_index(const struct intel_device_info *devinfo,
                      brw_compact_inst *dst, const brw_inst *src)
{
   assert(devinfo->ver >= 12);

   uint32_t uncompacted =                     /* 20b/TGL+ */
      (brw_inst_bits(src, 119, 115) << 15) |  /* 5b */
      (brw_inst_bits(src, 103, 99) << 10) |   /* 5b */
      (brw_inst_bits(src, 71, 67) << 5) |     /* 5b */
      (brw_inst_bits(src, 55, 51));           /* 5b */

   const uint32_t *table = devinfo->ver >= 20 ? xe2_3src_subreg_table :
                                                gfx12_3src_subreg_table;
   const uint32_t len =
      devinfo->ver >= 20 ? ARRAY_SIZE(xe2_3src_subreg_table) :
                           ARRAY_SIZE(gfx12_3src_subreg_table);

   for (unsigned i = 0; i < len; i++) {
      if (table[i] == uncompacted) {
         brw_compact_inst_set_3src_subreg_index(devinfo, dst, i);
         return true;
      }
   }

   return false;
}

/* Returns true when the instruction uses bits that have no home in the
 * compacted encoding, so it must stay uncompacted.
 */
static bool
has_unmapped_bits(const struct brw_isa_info *isa, const brw_inst *src)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* EOT can only be mapped on a send if the src1 is an immediate */
   if ((brw_inst_opcode(isa, src) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(isa, src) == BRW_OPCODE_SEND) &&
       brw_inst_eot(devinfo, src))
      return true;

   /* Check for instruction bits that don't map to any of the fields of the
    * compacted instruction.  The instruction cannot be compacted if any of
    * them are set.  They overlap with:
    *  - NibCtrl (bit 47 on Gfx7, bit 11 on Gfx8)
    *  - Dst.AddrImm[9] (bit 47 on Gfx8)
    *  - Src0.AddrImm[9] (bit 95 on Gfx8)
    *  - Imm64[27:31] (bits 91-95 on Gfx7, bit 95 on Gfx8)
    *  - UIP[31] (bit 95 on Gfx8)
    */
   if (devinfo->ver >= 12) {
      assert(!brw_inst_bits(src, 7, 7));
      return false;
   } else if (devinfo->ver >= 8) {
      assert(!brw_inst_bits(src, 7, 7));
      return brw_inst_bits(src, 95, 95) ||
             brw_inst_bits(src, 47, 47) ||
             brw_inst_bits(src, 11, 11);
   } else {
      assert(!brw_inst_bits(src, 7, 7) &&
             !(devinfo->ver < 7 && brw_inst_bits(src, 90, 90)));
      return brw_inst_bits(src, 95, 91) ||
             brw_inst_bits(src, 47, 47);
   }
}

/* Three-source counterpart of has_unmapped_bits. */
static bool
has_3src_unmapped_bits(const struct intel_device_info *devinfo,
                       const brw_inst *src, bool is_dpas)
{
   /* Check for three-source instruction bits that don't map to any of the
    * fields of the compacted instruction. All of them seem to be reserved
    * bits currently.
    */
   if (devinfo->ver >= 20) {
      assert(is_dpas || !brw_inst_bits(src, 49, 49));
      assert(!brw_inst_bits(src, 33, 33));
      assert(!brw_inst_bits(src, 7, 7));
   } else if (devinfo->ver >= 12) {
      assert(is_dpas || !brw_inst_bits(src, 49, 49));
      assert(!brw_inst_bits(src, 7, 7));
   } else if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) {
      assert(!brw_inst_bits(src, 127, 127) &&
             !brw_inst_bits(src, 7, 7));
   } else {
      assert(devinfo->ver >= 8);
      assert(!brw_inst_bits(src, 127, 126) &&
             !brw_inst_bits(src, 105, 105) &&
             !brw_inst_bits(src, 84, 84) &&
             !brw_inst_bits(src, 7, 7));

      /* Src1Type and Src2Type, used for mixed-precision floating point */
      if (brw_inst_bits(src, 36, 35))
         return true;
   }

   return false;
}

/* Attempt to compact a three-source (or DPAS) instruction into dst.
 * dst is only written on success.
 */
static bool
brw_try_compact_3src_instruction(const struct brw_isa_info *isa,
                                 brw_compact_inst *dst, const brw_inst *src)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   assert(devinfo->ver >= 8);

   bool is_dpas = brw_inst_opcode(isa, src) == BRW_OPCODE_DPAS;
   if (has_3src_unmapped_bits(devinfo, src, is_dpas))
      return false;

#define compact(field) \
   brw_compact_inst_set_3src_##field(devinfo, dst, brw_inst_3src_##field(devinfo, src))
#define compact_a16(field) \
   brw_compact_inst_set_3src_##field(devinfo, dst, brw_inst_3src_a16_##field(devinfo, src))

   compact(hw_opcode);

   if (!set_3src_control_index(devinfo, dst, src, is_dpas))
      return false;

   if (!set_3src_source_index(devinfo, dst, src, is_dpas))
      return false;

   if (devinfo->ver >= 12) {
      if (!set_3src_subreg_index(devinfo, dst, src))
         return false;

      compact(swsb);
      compact(debug_control);
      compact(dst_reg_nr);
      compact(src0_reg_nr);
      compact(src1_reg_nr);
      compact(src2_reg_nr);
   } else {
      compact(dst_reg_nr);
      compact_a16(src0_rep_ctrl);
      compact(debug_control);
      compact(saturate);
      compact_a16(src1_rep_ctrl);
      compact_a16(src2_rep_ctrl);
      compact(src0_reg_nr);
      compact(src1_reg_nr);
      compact(src2_reg_nr);
      compact_a16(src0_subreg_nr);
      compact_a16(src1_subreg_nr);
      compact_a16(src2_subreg_nr);
   }
   brw_compact_inst_set_3src_cmpt_control(devinfo, dst, true);

#undef compact
#undef compact_a16

   return true;
}

/* On SNB through ICL, compacted instructions have 12-bits for immediate
 * sources, and a 13th bit that's replicated through the high 20 bits.
 *
 * Effectively this means we get 12-bit integers, 0.0f, and some limited uses
 * of packed vectors as compactable immediates.
 *
 * On TGL+, the high 12-bits of floating-point values (:f and :hf) are encoded
 * rather than the low 12-bits. For signed integer the 12th bit is replicated,
 * while for unsigned integers it is not.
 *
 * Returns the compacted immediate, or -1 if immediate cannot be compacted
 */
static int
compact_immediate(const struct intel_device_info *devinfo,
                  enum brw_reg_type type, unsigned imm)
{
   if (devinfo->ver >= 12) {
      /* 16-bit immediates need to be replicated through the 32-bit immediate
       * field
       */
      switch (type) {
      case BRW_REGISTER_TYPE_W:
      case BRW_REGISTER_TYPE_UW:
      case BRW_REGISTER_TYPE_HF:
         if ((imm >> 16) != (imm & 0xffff))
            return -1;
         break;
      default:
         break;
      }

      switch (type) {
      case BRW_REGISTER_TYPE_F:
         /* We get the high 12-bits as-is; rest must be zero */
         if ((imm & 0xfffff) == 0)
            return (imm >> 20) & 0xfff;
         break;
      case BRW_REGISTER_TYPE_HF:
         /* We get the high 12-bits as-is; rest must be zero */
         if ((imm & 0xf) == 0)
            return (imm >> 4) & 0xfff;
         break;
      case BRW_REGISTER_TYPE_UD:
      case BRW_REGISTER_TYPE_VF:
      case BRW_REGISTER_TYPE_UV:
      case BRW_REGISTER_TYPE_V:
         /* We get the low 12-bits as-is; rest must be zero */
         if ((imm & 0xfffff000) == 0)
            return imm & 0xfff;
         break;
      case BRW_REGISTER_TYPE_UW:
         /* We get the low 12-bits as-is; rest must be zero */
         if ((imm & 0xf000) == 0)
            return imm & 0xfff;
         break;
      case BRW_REGISTER_TYPE_D:
         /* We get the low 11-bits as-is; 12th is replicated */
         if (((int)imm >> 11) == 0 || ((int)imm >> 11) == -1)
            return imm & 0xfff;
         break;
      case BRW_REGISTER_TYPE_W:
         /* We get the low 11-bits as-is; 12th is replicated */
         if (((short)imm >> 11) == 0 || ((short)imm >> 11) == -1)
            return imm & 0xfff;
         break;
      case BRW_REGISTER_TYPE_NF:
      case BRW_REGISTER_TYPE_DF:
      case BRW_REGISTER_TYPE_Q:
      case BRW_REGISTER_TYPE_UQ:
      case BRW_REGISTER_TYPE_B:
      case BRW_REGISTER_TYPE_UB:
         return -1;
      }
   } else {
      /* We get the low 12 bits as-is; 13th is replicated */
      if (((int)imm >> 12) == 0 || ((int)imm >> 12 == -1)) {
         return imm & 0x1fff;
      }
   }

   return -1;
}

/* Inverse of compact_immediate: reconstruct the 32-bit immediate from its
 * compacted form for the given type.
 */
static int
uncompact_immediate(const struct intel_device_info *devinfo,
                    enum brw_reg_type type, unsigned compact_imm)
{
   if (devinfo->ver >= 12) {
      switch (type) {
      case BRW_REGISTER_TYPE_F:
         return compact_imm << 20;
      case BRW_REGISTER_TYPE_HF:
         return (compact_imm << 20) | (compact_imm << 4);
      case BRW_REGISTER_TYPE_UD:
      case BRW_REGISTER_TYPE_VF:
      case BRW_REGISTER_TYPE_UV:
      case BRW_REGISTER_TYPE_V:
         return compact_imm;
      case BRW_REGISTER_TYPE_UW:
         /* Replicate */
         return compact_imm << 16 | compact_imm;
      case BRW_REGISTER_TYPE_D:
         /* Extend the 12th bit into the high 20 bits */
         return (int)(compact_imm << 20) >> 20;
      case BRW_REGISTER_TYPE_W:
         /* Extend the 12th bit into the high 4 bits and replicate */
         return ((int)(compact_imm << 20) >> 4) |
                ((unsigned short)((short)(compact_imm << 4) >> 4));
      case BRW_REGISTER_TYPE_NF:
      case BRW_REGISTER_TYPE_DF:
      case BRW_REGISTER_TYPE_Q:
      case BRW_REGISTER_TYPE_UQ:
      case BRW_REGISTER_TYPE_B:
      case BRW_REGISTER_TYPE_UB:
         unreachable("not reached");
      }
   } else {
      /* Replicate the 13th bit into the high 19 bits */
      return (int)(compact_imm << 19) >> 19;
   }

   unreachable("not reached");
}

/* Returns true (and the immediate's type via *type) when src0 or src1 is an
 * immediate with a valid register type.
 */
static bool
has_immediate(const struct intel_device_info *devinfo, const brw_inst *inst,
              enum brw_reg_type *type)
{
   if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      *type = brw_inst_src0_type(devinfo, inst);
      return *type != INVALID_REG_TYPE;
   } else if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      *type = brw_inst_src1_type(devinfo, inst);
      return *type != INVALID_REG_TYPE;
   }

   return false;
}

/**
 * Applies some small changes to instruction types to increase chances of
 * compaction.
 */
static brw_inst
precompact(const struct brw_isa_info *isa, brw_inst inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* In XeHP the compaction tables removed the entries for source regions
    * <8;8,1> giving preference to <1;1,0> as the way to indicate
    * sequential elements, so convert to those before compacting.
    */
   if (devinfo->verx10 >= 125) {
      if (brw_inst_src0_reg_file(devinfo, &inst) == BRW_GENERAL_REGISTER_FILE &&
          brw_inst_src0_vstride(devinfo, &inst) > BRW_VERTICAL_STRIDE_1 &&
          brw_inst_src0_vstride(devinfo, &inst) == (brw_inst_src0_width(devinfo, &inst) + 1) &&
          brw_inst_src0_hstride(devinfo, &inst) == BRW_HORIZONTAL_STRIDE_1) {
         brw_inst_set_src0_vstride(devinfo, &inst, BRW_VERTICAL_STRIDE_1);
         brw_inst_set_src0_width(devinfo, &inst, BRW_WIDTH_1);
         brw_inst_set_src0_hstride(devinfo, &inst, BRW_HORIZONTAL_STRIDE_0);
      }

      if (brw_inst_src1_reg_file(devinfo, &inst) == BRW_GENERAL_REGISTER_FILE &&
          brw_inst_src1_vstride(devinfo, &inst) > BRW_VERTICAL_STRIDE_1 &&
          brw_inst_src1_vstride(devinfo, &inst) == (brw_inst_src1_width(devinfo, &inst) + 1) &&
          brw_inst_src1_hstride(devinfo, &inst) == BRW_HORIZONTAL_STRIDE_1) {
         brw_inst_set_src1_vstride(devinfo, &inst, BRW_VERTICAL_STRIDE_1);
         brw_inst_set_src1_width(devinfo, &inst, BRW_WIDTH_1);
         brw_inst_set_src1_hstride(devinfo, &inst, BRW_HORIZONTAL_STRIDE_0);
      }
   }

   if (brw_inst_src0_reg_file(devinfo, &inst) != BRW_IMMEDIATE_VALUE)
      return inst;

   /* The Bspec's section titled "Non-present Operands" claims that if src0
    * is an immediate that src1's type must be the same as that of src0.
    *
    * The SNB+ DataTypeIndex instruction compaction tables contain mappings
    * that do not follow this rule. E.g., from the IVB/HSW table:
    *
    *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
    *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
    *
    * And from the SNB table:
    *
    *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
    *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
    *
    * Neither of these cause warnings from the simulator when used,
    * compacted or otherwise. In fact, all compaction mappings that have an
    * immediate in src0 use a:ud for src1.
    *
    * The GM45 instruction compaction tables do not contain mapped meanings
    * so it's not clear whether it has the restriction. We'll assume it was
    * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
    *
    * Don't do any of this for 64-bit immediates, since the src1 fields
    * overlap with the immediate and setting them would overwrite the
    * immediate we set.
    */
   if (devinfo->ver >= 6 &&
       !(devinfo->platform == INTEL_PLATFORM_HSW &&
         brw_inst_opcode(isa, &inst) == BRW_OPCODE_DIM) &&
       !(devinfo->ver >= 8 &&
         (brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_DF ||
          brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_UQ ||
          brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_Q))) {
      brw_inst_set_src1_reg_hw_type(devinfo, &inst, 0);
   }

   /* Compacted instructions only have 12-bits (plus 1 for the other 20)
    * for immediate values. Presumably the hardware engineers realized
    * that the only useful floating-point value that could be represented
    * in this format is 0.0, which can also be represented as a VF-typed
    * immediate, so they gave us the previously mentioned mapping on IVB+.
    *
    * Strangely, we do have a mapping for imm:f in src1, so we don't need
    * to do this there.
    *
    * If we see a 0.0:F, change the type to VF so that it can be compacted.
    *
    * Compaction of floating-point immediates is improved on Gfx12, thus
    * removing the need for this.
    */
   if (devinfo->ver < 12 &&
       brw_inst_imm_ud(devinfo, &inst) == 0x0 &&
       brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_F &&
       brw_inst_dst_type(devinfo, &inst) == BRW_REGISTER_TYPE_F &&
       brw_inst_dst_hstride(devinfo, &inst) == BRW_HORIZONTAL_STRIDE_1) {
      enum brw_reg_file file = brw_inst_src0_reg_file(devinfo, &inst);
      brw_inst_set_src0_file_type(devinfo, &inst, file, BRW_REGISTER_TYPE_VF);
   }

   /* There are no mappings for dst:d | i:d, so if the immediate is suitable
    * set the types to :UD so the instruction can be compacted.
    *
    * FINISHME: Use dst:f | imm:f on Gfx12
    */
   if (devinfo->ver < 12 &&
       compact_immediate(devinfo, BRW_REGISTER_TYPE_D,
                         brw_inst_imm_ud(devinfo, &inst)) != -1 &&
       brw_inst_cond_modifier(devinfo, &inst) == BRW_CONDITIONAL_NONE &&
       brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_D &&
       brw_inst_dst_type(devinfo, &inst) == BRW_REGISTER_TYPE_D) {
      enum brw_reg_file src_file = brw_inst_src0_reg_file(devinfo, &inst);
      enum brw_reg_file dst_file = brw_inst_dst_reg_file(devinfo, &inst);

      brw_inst_set_src0_file_type(devinfo, &inst, src_file, BRW_REGISTER_TYPE_UD);
      brw_inst_set_dst_file_type(devinfo, &inst, dst_file, BRW_REGISTER_TYPE_UD);
   }

   return inst;
}

/**
 * Tries to compact instruction src into dst.
 *
 * It doesn't modify dst unless src is compactable, which is relied on by
 * brw_compact_instructions().
 */
static bool
try_compact_instruction(const struct compaction_state *c,
                        brw_compact_inst *dst, const brw_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   brw_compact_inst temp;

   assert(brw_inst_cmpt_control(devinfo, src) == 0);

   if (is_3src(c->isa, brw_inst_opcode(c->isa, src))) {
      if (devinfo->ver >= 8) {
         memset(&temp, 0, sizeof(temp));
         if (brw_try_compact_3src_instruction(c->isa, &temp, src)) {
            *dst = temp;
            return true;
         } else {
            return false;
         }
      } else {
         return false;
      }
   }

   enum brw_reg_type type;
   bool is_immediate = has_immediate(devinfo, src, &type);

   unsigned compacted_imm = 0;

   if (is_immediate) {
      /* Instructions with immediates cannot be compacted on Gen < 6 */
      if (devinfo->ver < 6)
         return false;

      compacted_imm = compact_immediate(devinfo, type,
                                        brw_inst_imm_ud(devinfo, src));
      /* NOTE(review): compacted_imm is unsigned, so this relies on the
       * usual arithmetic conversion of -1 to UINT_MAX — well-defined,
       * since compact_immediate never legitimately returns that value.
       */
      if (compacted_imm == -1)
         return false;
   }

   if (has_unmapped_bits(c->isa, src))
      return false;

   memset(&temp, 0, sizeof(temp));

#define compact(field) \
   brw_compact_inst_set_##field(devinfo, &temp, brw_inst_##field(devinfo, src))
#define compact_reg(field) \
   brw_compact_inst_set_##field##_reg_nr(devinfo, &temp, \
                                         brw_inst_##field##_da_reg_nr(devinfo, src))

   compact(hw_opcode);
   compact(debug_control);

   if (!set_control_index(c, &temp, src))
      return false;
   if (!set_datatype_index(c, &temp, src, is_immediate))
      return false;
   if (!set_subreg_index(c, &temp, src, is_immediate))
      return false;
   if (!set_src0_index(c, &temp, src))
      return false;
   if (!set_src1_index(c, &temp, src, is_immediate, compacted_imm))
      return false;

   if (devinfo->ver >= 12) {
      compact(swsb);
      compact_reg(dst);
      compact_reg(src0);

      if (is_immediate) {
         /* src1 reg takes the high 8 bits (of the 12-bit compacted value) */
         brw_compact_inst_set_src1_reg_nr(devinfo, &temp, compacted_imm >> 4);
      } else {
         compact_reg(src1);
      }
   } else {
      if (devinfo->ver >= 6) {
         compact(acc_wr_control);
      } else {
         compact(mask_control_ex);
      }

      if (devinfo->ver <= 6)
         compact(flag_subreg_nr);

      compact(cond_modifier);

      compact_reg(dst);
      compact_reg(src0);

      if (is_immediate) {
         /* src1 reg takes the low 8 bits (of the 13-bit compacted value) */
         brw_compact_inst_set_src1_reg_nr(devinfo, &temp, compacted_imm & 0xff);
      } else {
         compact_reg(src1);
      }
   }
   brw_compact_inst_set_cmpt_control(devinfo, &temp, true);

#undef compact
#undef compact_reg

   *dst = temp;

   return true;
}

/* Public wrapper: builds a fresh compaction_state and attempts compaction. */
bool
brw_try_compact_instruction(const struct brw_isa_info *isa,
                            brw_compact_inst *dst, const brw_inst *src)
{
   struct compaction_state c;
   compaction_state_init(&c, isa);
   return try_compact_instruction(&c, dst, src);
}

/* Inverse of set_control_index: scatter a control-table entry back into the
 * full-width instruction's bit positions.
 */
static void
set_uncompacted_control(const struct compaction_state *c, brw_inst *dst,
                        brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t uncompacted =
      c->control_index_table[brw_compact_inst_control_index(devinfo, src)];

   if (devinfo->ver >= 20) {
      brw_inst_set_bits(dst, 95, 92, (uncompacted >> 14) & 0xf);
      brw_inst_set_bits(dst, 34, 34, (uncompacted >> 13) & 0x1);
      brw_inst_set_bits(dst, 32, 32, (uncompacted >> 12) & 0x1);
      brw_inst_set_bits(dst, 31, 31, (uncompacted >> 11) & 0x1);
      brw_inst_set_bits(dst, 28, 28, (uncompacted >> 10) & 0x1);
      brw_inst_set_bits(dst, 27, 26, (uncompacted >> 8) & 0x3);
      brw_inst_set_bits(dst, 25, 24, (uncompacted >> 6) & 0x3);
      brw_inst_set_bits(dst, 23, 21, (uncompacted >> 3) & 0x7);
      brw_inst_set_bits(dst, 20, 18, (uncompacted >> 0) & 0x7);
   } else if (devinfo->ver >= 12) {
      brw_inst_set_bits(dst, 95, 92, (uncompacted >> 17));
      brw_inst_set_bits(dst, 34, 34, (uncompacted >> 16) & 0x1);
      brw_inst_set_bits(dst, 33, 33, (uncompacted >> 15) & 0x1);
      brw_inst_set_bits(dst, 32, 32, (uncompacted >> 14) & 0x1);
      brw_inst_set_bits(dst, 31, 31, (uncompacted >> 13) & 0x1);
      brw_inst_set_bits(dst, 28, 28, (uncompacted >> 12) & 0x1);
      brw_inst_set_bits(dst, 27, 24, (uncompacted >> 8) & 0xf);
      brw_inst_set_bits(dst, 23, 22, (uncompacted >> 6) & 0x3);
      brw_inst_set_bits(dst, 21, 19, (uncompacted >> 3) & 0x7);
      brw_inst_set_bits(dst, 18, 16, (uncompacted >> 0) & 0x7);
   } else if (devinfo->ver >= 8) {
      brw_inst_set_bits(dst, 33, 31, (uncompacted >> 16));
      brw_inst_set_bits(dst, 23, 12, (uncompacted >> 4) & 0xfff);
      brw_inst_set_bits(dst, 10, 9, (uncompacted >> 2) & 0x3);
      brw_inst_set_bits(dst, 34, 34, (uncompacted >> 1) & 0x1);
      brw_inst_set_bits(dst, 8, 8, (uncompacted >> 0) & 0x1);
   } else {
      brw_inst_set_bits(dst, 31, 31, (uncompacted >> 16) & 0x1);
      brw_inst_set_bits(dst, 23, 8, (uncompacted & 0xffff));

      if (devinfo->ver == 7)
         brw_inst_set_bits(dst, 90, 89, uncompacted >> 17);
   }
}

/* Inverse of set_datatype_index. */
static void
set_uncompacted_datatype(const struct compaction_state *c, brw_inst *dst,
                         brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t uncompacted =
      c->datatype_table[brw_compact_inst_datatype_index(devinfo, src)];

   if (devinfo->ver >= 12) {
      brw_inst_set_bits(dst, 98, 98, (uncompacted >> 19));
      brw_inst_set_bits(dst, 91, 88, (uncompacted >> 15) & 0xf);
      brw_inst_set_bits(dst, 66, 66, (uncompacted >> 14) & 0x1);
      brw_inst_set_bits(dst, 50, 50, (uncompacted >> 13) & 0x1);
      brw_inst_set_bits(dst, 49, 48, (uncompacted >> 11) & 0x3);
      brw_inst_set_bits(dst, 47, 47, (uncompacted >> 10) & 0x1);
      brw_inst_set_bits(dst, 46, 46, (uncompacted >> 9) & 0x1);
      brw_inst_set_bits(dst, 43, 40, (uncompacted >> 5) & 0xf);
      brw_inst_set_bits(dst, 39, 36, (uncompacted >> 1) & 0xf);
      brw_inst_set_bits(dst, 35, 35, (uncompacted >> 0) & 0x1);
   } else if (devinfo->ver >= 8) {
      brw_inst_set_bits(dst, 63, 61, (uncompacted >> 18));
      brw_inst_set_bits(dst, 94, 89, (uncompacted >> 12) & 0x3f);
      brw_inst_set_bits(dst, 46, 35, (uncompacted >> 0) & 0xfff);
   } else {
      brw_inst_set_bits(dst, 63, 61, (uncompacted >> 15));
      brw_inst_set_bits(dst, 46, 32, (uncompacted & 0x7fff));
   }
}

/* Inverse of set_subreg_index. */
static void
set_uncompacted_subreg(const struct compaction_state *c, brw_inst *dst,
                       brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint16_t uncompacted =
      c->subreg_table[brw_compact_inst_subreg_index(devinfo, src)];

   if (devinfo->ver >= 20) {
      brw_inst_set_bits(dst, 33, 33, (uncompacted >> 0) & 0x1);
      brw_inst_set_bits(dst, 55, 51, (uncompacted >> 1) & 0x1f);
      brw_inst_set_bits(dst, 71, 67, (uncompacted >> 6) & 0x1f);
      brw_inst_set_bits(dst, 87, 87, (uncompacted >> 11) & 0x1);
   } else if (devinfo->ver >= 12) {
      brw_inst_set_bits(dst, 103, 99, (uncompacted >> 10));
      brw_inst_set_bits(dst, 71, 67, (uncompacted >> 5) & 0x1f);
      brw_inst_set_bits(dst, 55, 51, (uncompacted >> 0) & 0x1f);
   } else {
      brw_inst_set_bits(dst, 100, 96, (uncompacted >> 10));
      brw_inst_set_bits(dst, 68, 64, (uncompacted >> 5) & 0x1f);
      brw_inst_set_bits(dst, 52, 48, (uncompacted >> 0) & 0x1f);
   }
}

/* Inverse of set_src0_index. */
static void
set_uncompacted_src0(const struct compaction_state *c, brw_inst *dst,
                     brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint32_t compacted = brw_compact_inst_src0_index(devinfo, src);
   uint16_t uncompacted = c->src0_index_table[compacted];

   if (devinfo->ver >= 12) {
      if (devinfo->ver < 20)
         brw_inst_set_bits(dst, 87, 87, (uncompacted >> 11) & 0x1);
      brw_inst_set_bits(dst, 86, 84, (uncompacted >> 8) & 0x7);
      brw_inst_set_bits(dst, 83, 81, (uncompacted >> 5) & 0x7);
      brw_inst_set_bits(dst, 80, 80, (uncompacted >> 4) & 0x1);
      brw_inst_set_bits(dst, 65, 64, (uncompacted >> 2) & 0x3);
      brw_inst_set_bits(dst, 45, 44, (uncompacted >> 0) & 0x3);
   } else {
      brw_inst_set_bits(dst, 88, 77, uncompacted);
   }
}

/* Inverse of set_src1_index (register, non-immediate case). */
static void
set_uncompacted_src1(const struct compaction_state *c, brw_inst *dst,
                     brw_compact_inst *src)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   uint16_t uncompacted =
      c->src1_index_table[brw_compact_inst_src1_index(devinfo, src)];

   if (devinfo->ver >= 20) {
      brw_inst_set_bits(dst, 121, 120, (uncompacted >> 14) & 0x3);
      brw_inst_set_bits(dst, 118, 116, (uncompacted >> 11) & 0x7);
      brw_inst_set_bits(dst, 115, 113, (uncompacted >> 8) & 0x7);
      brw_inst_set_bits(dst, 112, 112, (uncompacted >> 7) & 0x1);
      brw_inst_set_bits(dst, 103, 99, (uncompacted >> 2) & 0x1f);
      brw_inst_set_bits(dst, 97, 96, (uncompacted >> 0) & 0x3);
   } else if (devinfo->ver >= 12) {
      brw_inst_set_bits(dst, 121, 120, (uncompacted >> 10));
      brw_inst_set_bits(dst, 119, 116, (uncompacted >> 6) & 0xf);
      brw_inst_set_bits(dst, 115, 113, (uncompacted >> 3) & 0x7);
      brw_inst_set_bits(dst, 112, 112, (uncompacted >> 2) & 0x1);
      brw_inst_set_bits(dst, 97, 96, (uncompacted >> 0) & 0x3);
   } else {
      brw_inst_set_bits(dst, 120, 109, uncompacted);
   }
}

/* Inverse of set_3src_control_index.  Continues past this chunk. */
static void
set_uncompacted_3src_control_index(const struct compaction_state *c,
                                   brw_inst *dst, brw_compact_inst *src,
                                   bool is_dpas)
{
   const struct intel_device_info *devinfo = c->isa->devinfo;
   assert(devinfo->ver >= 8);

   if (devinfo->ver >= 20) {
      uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src);
      uint64_t uncompacted = is_dpas ?
xe2_3src_dpas_control_index_table[compacted] : + xe2_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 95, 92, (uncompacted >> 30) & 0xf); + brw_inst_set_bits(dst, 90, 88, (uncompacted >> 27) & 0x7); + brw_inst_set_bits(dst, 82, 80, (uncompacted >> 24) & 0x7); + brw_inst_set_bits(dst, 50, 50, (uncompacted >> 23) & 0x1); + brw_inst_set_bits(dst, 49, 48, (uncompacted >> 21) & 0x3); + brw_inst_set_bits(dst, 42, 40, (uncompacted >> 18) & 0x7); + brw_inst_set_bits(dst, 39, 39, (uncompacted >> 17) & 0x1); + brw_inst_set_bits(dst, 38, 36, (uncompacted >> 14) & 0x7); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 32, 32, (uncompacted >> 12) & 0x1); + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 11) & 0x1); + brw_inst_set_bits(dst, 28, 28, (uncompacted >> 10) & 0x1); + brw_inst_set_bits(dst, 27, 26, (uncompacted >> 8) & 0x3); + brw_inst_set_bits(dst, 25, 24, (uncompacted >> 6) & 0x3); + brw_inst_set_bits(dst, 23, 21, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 20, 18, (uncompacted >> 0) & 0x7); + + } else if (devinfo->verx10 >= 125) { + uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint64_t uncompacted = xehp_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 95, 92, (uncompacted >> 33)); + brw_inst_set_bits(dst, 90, 88, (uncompacted >> 30) & 0x7); + brw_inst_set_bits(dst, 82, 80, (uncompacted >> 27) & 0x7); + brw_inst_set_bits(dst, 50, 50, (uncompacted >> 26) & 0x1); + brw_inst_set_bits(dst, 49, 48, (uncompacted >> 24) & 0x3); + brw_inst_set_bits(dst, 42, 40, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 39, 39, (uncompacted >> 20) & 0x1); + brw_inst_set_bits(dst, 38, 36, (uncompacted >> 17) & 0x7); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 16) & 0x1); + brw_inst_set_bits(dst, 33, 33, (uncompacted >> 15) & 0x1); + brw_inst_set_bits(dst, 32, 32, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 
28, 28, (uncompacted >> 12) & 0x1); + brw_inst_set_bits(dst, 27, 24, (uncompacted >> 8) & 0xf); + brw_inst_set_bits(dst, 23, 23, (uncompacted >> 7) & 0x1); + brw_inst_set_bits(dst, 22, 22, (uncompacted >> 6) & 0x1); + brw_inst_set_bits(dst, 21, 19, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 18, 16, (uncompacted >> 0) & 0x7); + + } else if (devinfo->ver >= 12) { + uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint64_t uncompacted = gfx12_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 95, 92, (uncompacted >> 32)); + brw_inst_set_bits(dst, 90, 88, (uncompacted >> 29) & 0x7); + brw_inst_set_bits(dst, 82, 80, (uncompacted >> 26) & 0x7); + brw_inst_set_bits(dst, 50, 50, (uncompacted >> 25) & 0x1); + brw_inst_set_bits(dst, 48, 48, (uncompacted >> 24) & 0x1); + brw_inst_set_bits(dst, 42, 40, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 39, 39, (uncompacted >> 20) & 0x1); + brw_inst_set_bits(dst, 38, 36, (uncompacted >> 17) & 0x7); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 16) & 0x1); + brw_inst_set_bits(dst, 33, 33, (uncompacted >> 15) & 0x1); + brw_inst_set_bits(dst, 32, 32, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 28, 28, (uncompacted >> 12) & 0x1); + brw_inst_set_bits(dst, 27, 24, (uncompacted >> 8) & 0xf); + brw_inst_set_bits(dst, 23, 23, (uncompacted >> 7) & 0x1); + brw_inst_set_bits(dst, 22, 22, (uncompacted >> 6) & 0x1); + brw_inst_set_bits(dst, 21, 19, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 18, 16, (uncompacted >> 0) & 0x7); + } else { + uint32_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint32_t uncompacted = gfx8_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 34, 32, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 28, 8, (uncompacted >> 0) & 0x1fffff); + + if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) + brw_inst_set_bits(dst, 36, 35, (uncompacted 
>> 24) & 0x3); + } +} + +static void +set_uncompacted_3src_source_index(const struct intel_device_info *devinfo, + brw_inst *dst, brw_compact_inst *src, + bool is_dpas) +{ + assert(devinfo->ver >= 8); + + uint32_t compacted = brw_compact_inst_3src_source_index(devinfo, src); + + if (devinfo->ver >= 12) { + const uint32_t *three_src_source_index_table = + devinfo->ver >= 20 ? (is_dpas ? xe2_3src_dpas_source_index_table : + xe2_3src_source_index_table) : + devinfo->verx10 >= 125 ? xehp_3src_source_index_table : + gfx12_3src_source_index_table; + uint32_t uncompacted = three_src_source_index_table[compacted]; + + brw_inst_set_bits(dst, 114, 114, (uncompacted >> 20)); + brw_inst_set_bits(dst, 113, 112, (uncompacted >> 18) & 0x3); + brw_inst_set_bits(dst, 98, 98, (uncompacted >> 17) & 0x1); + brw_inst_set_bits(dst, 97, 96, (uncompacted >> 15) & 0x3); + brw_inst_set_bits(dst, 91, 91, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 87, 86, (uncompacted >> 12) & 0x3); + brw_inst_set_bits(dst, 85, 84, (uncompacted >> 10) & 0x3); + brw_inst_set_bits(dst, 83, 83, (uncompacted >> 9) & 0x1); + brw_inst_set_bits(dst, 66, 66, (uncompacted >> 8) & 0x1); + brw_inst_set_bits(dst, 65, 64, (uncompacted >> 6) & 0x3); + brw_inst_set_bits(dst, 47, 47, (uncompacted >> 5) & 0x1); + brw_inst_set_bits(dst, 46, 46, (uncompacted >> 4) & 0x1); + brw_inst_set_bits(dst, 45, 44, (uncompacted >> 2) & 0x3); + brw_inst_set_bits(dst, 43, 43, (uncompacted >> 1) & 0x1); + brw_inst_set_bits(dst, 35, 35, (uncompacted >> 0) & 0x1); + } else { + uint64_t uncompacted = gfx8_3src_source_index_table[compacted]; + + brw_inst_set_bits(dst, 83, 83, (uncompacted >> 43) & 0x1); + brw_inst_set_bits(dst, 114, 107, (uncompacted >> 35) & 0xff); + brw_inst_set_bits(dst, 93, 86, (uncompacted >> 27) & 0xff); + brw_inst_set_bits(dst, 72, 65, (uncompacted >> 19) & 0xff); + brw_inst_set_bits(dst, 55, 37, (uncompacted >> 0) & 0x7ffff); + + if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) { + 
brw_inst_set_bits(dst, 126, 125, (uncompacted >> 47) & 0x3); + brw_inst_set_bits(dst, 105, 104, (uncompacted >> 45) & 0x3); + brw_inst_set_bits(dst, 84, 84, (uncompacted >> 44) & 0x1); + } else { + brw_inst_set_bits(dst, 125, 125, (uncompacted >> 45) & 0x1); + brw_inst_set_bits(dst, 104, 104, (uncompacted >> 44) & 0x1); + } + } +} + +static void +set_uncompacted_3src_subreg_index(const struct intel_device_info *devinfo, + brw_inst *dst, brw_compact_inst *src) +{ + assert(devinfo->ver >= 12); + + uint32_t compacted = brw_compact_inst_3src_subreg_index(devinfo, src); + uint32_t uncompacted = (devinfo->ver >= 20 ? xe2_3src_subreg_table[compacted]: + gfx12_3src_subreg_table[compacted]); + + brw_inst_set_bits(dst, 119, 115, (uncompacted >> 15)); + brw_inst_set_bits(dst, 103, 99, (uncompacted >> 10) & 0x1f); + brw_inst_set_bits(dst, 71, 67, (uncompacted >> 5) & 0x1f); + brw_inst_set_bits(dst, 55, 51, (uncompacted >> 0) & 0x1f); +} + +static void +brw_uncompact_3src_instruction(const struct compaction_state *c, + brw_inst *dst, brw_compact_inst *src, bool is_dpas) +{ + const struct intel_device_info *devinfo = c->isa->devinfo; + assert(devinfo->ver >= 8); + +#define uncompact(field) \ + brw_inst_set_3src_##field(devinfo, dst, brw_compact_inst_3src_##field(devinfo, src)) +#define uncompact_a16(field) \ + brw_inst_set_3src_a16_##field(devinfo, dst, brw_compact_inst_3src_##field(devinfo, src)) + + uncompact(hw_opcode); + + if (devinfo->ver >= 12) { + set_uncompacted_3src_control_index(c, dst, src, is_dpas); + set_uncompacted_3src_source_index(devinfo, dst, src, is_dpas); + set_uncompacted_3src_subreg_index(devinfo, dst, src); + + uncompact(debug_control); + uncompact(swsb); + uncompact(dst_reg_nr); + uncompact(src0_reg_nr); + uncompact(src1_reg_nr); + uncompact(src2_reg_nr); + } else { + set_uncompacted_3src_control_index(c, dst, src, is_dpas); + set_uncompacted_3src_source_index(devinfo, dst, src, is_dpas); + + uncompact(dst_reg_nr); + uncompact_a16(src0_rep_ctrl); + 
uncompact(debug_control); + uncompact(saturate); + uncompact_a16(src1_rep_ctrl); + uncompact_a16(src2_rep_ctrl); + uncompact(src0_reg_nr); + uncompact(src1_reg_nr); + uncompact(src2_reg_nr); + uncompact_a16(src0_subreg_nr); + uncompact_a16(src1_subreg_nr); + uncompact_a16(src2_subreg_nr); + } + brw_inst_set_3src_cmpt_control(devinfo, dst, false); + +#undef uncompact +#undef uncompact_a16 +} + +static void +uncompact_instruction(const struct compaction_state *c, brw_inst *dst, + brw_compact_inst *src) +{ + const struct intel_device_info *devinfo = c->isa->devinfo; + memset(dst, 0, sizeof(*dst)); + + if (devinfo->ver >= 8) { + const enum opcode opcode = + brw_opcode_decode(c->isa, brw_compact_inst_3src_hw_opcode(devinfo, src)); + if (is_3src(c->isa, opcode)) { + const bool is_dpas = opcode == BRW_OPCODE_DPAS; + brw_uncompact_3src_instruction(c, dst, src, is_dpas); + return; + } + } + +#define uncompact(field) \ + brw_inst_set_##field(devinfo, dst, brw_compact_inst_##field(devinfo, src)) +#define uncompact_reg(field) \ + brw_inst_set_##field##_da_reg_nr(devinfo, dst, \ + brw_compact_inst_##field##_reg_nr(devinfo, src)) + + uncompact(hw_opcode); + uncompact(debug_control); + + set_uncompacted_control(c, dst, src); + set_uncompacted_datatype(c, dst, src); + set_uncompacted_subreg(c, dst, src); + set_uncompacted_src0(c, dst, src); + + enum brw_reg_type type; + if (has_immediate(devinfo, dst, &type)) { + unsigned imm = uncompact_immediate(devinfo, type, + brw_compact_inst_imm(devinfo, src)); + brw_inst_set_imm_ud(devinfo, dst, imm); + } else { + set_uncompacted_src1(c, dst, src); + uncompact_reg(src1); + } + + if (devinfo->ver >= 12) { + uncompact(swsb); + uncompact_reg(dst); + uncompact_reg(src0); + } else { + if (devinfo->ver >= 6) { + uncompact(acc_wr_control); + } else { + uncompact(mask_control_ex); + } + + uncompact(cond_modifier); + + if (devinfo->ver <= 6) + uncompact(flag_subreg_nr); + + uncompact_reg(dst); + uncompact_reg(src0); + } + 
brw_inst_set_cmpt_control(devinfo, dst, false); + +#undef uncompact +#undef uncompact_reg +} + +void +brw_uncompact_instruction(const struct brw_isa_info *isa, + brw_inst *dst, brw_compact_inst *src) +{ + struct compaction_state c; + compaction_state_init(&c, isa); + uncompact_instruction(&c, dst, src); +} + +void +brw_debug_compact_uncompact(const struct brw_isa_info *isa, + brw_inst *orig, + brw_inst *uncompacted) +{ + fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n", + isa->devinfo->ver); + + fprintf(stderr, " before: "); + brw_disassemble_inst(stderr, isa, orig, true, 0, NULL); + + fprintf(stderr, " after: "); + brw_disassemble_inst(stderr, isa, uncompacted, false, 0, NULL); + + uint32_t *before_bits = (uint32_t *)orig; + uint32_t *after_bits = (uint32_t *)uncompacted; + fprintf(stderr, " changed bits:\n"); + for (int i = 0; i < 128; i++) { + uint32_t before = before_bits[i / 32] & (1 << (i & 31)); + uint32_t after = after_bits[i / 32] & (1 << (i & 31)); + + if (before != after) { + fprintf(stderr, " bit %d, %s to %s\n", i, + before ? "set" : "unset", + after ? "set" : "unset"); + } + } +} + +static int +compacted_between(int old_ip, int old_target_ip, int *compacted_counts) +{ + int this_compacted_count = compacted_counts[old_ip]; + int target_compacted_count = compacted_counts[old_target_ip]; + return target_compacted_count - this_compacted_count; +} + +static void +update_uip_jip(const struct brw_isa_info *isa, brw_inst *insn, + int this_old_ip, int *compacted_counts) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + /* JIP and UIP are in units of: + * - bytes on Gfx8+; and + * - compacted instructions on Gfx6+. + */ + int shift = devinfo->ver >= 8 ? 
3 : 0; + + int32_t jip_compacted = brw_inst_jip(devinfo, insn) >> shift; + jip_compacted -= compacted_between(this_old_ip, + this_old_ip + (jip_compacted / 2), + compacted_counts); + brw_inst_set_jip(devinfo, insn, jip_compacted << shift); + + if (brw_inst_opcode(isa, insn) == BRW_OPCODE_ENDIF || + brw_inst_opcode(isa, insn) == BRW_OPCODE_WHILE || + (brw_inst_opcode(isa, insn) == BRW_OPCODE_ELSE && devinfo->ver <= 7)) + return; + + int32_t uip_compacted = brw_inst_uip(devinfo, insn) >> shift; + uip_compacted -= compacted_between(this_old_ip, + this_old_ip + (uip_compacted / 2), + compacted_counts); + brw_inst_set_uip(devinfo, insn, uip_compacted << shift); +} + +static void +update_gfx4_jump_count(const struct intel_device_info *devinfo, brw_inst *insn, + int this_old_ip, int *compacted_counts) +{ + assert(devinfo->ver == 5 || devinfo->platform == INTEL_PLATFORM_G4X); + + /* Jump Count is in units of: + * - uncompacted instructions on G45; and + * - compacted instructions on Gfx5. + */ + int shift = devinfo->platform == INTEL_PLATFORM_G4X ? 
1 : 0; + + int jump_count_compacted = brw_inst_gfx4_jump_count(devinfo, insn) << shift; + + int target_old_ip = this_old_ip + (jump_count_compacted / 2); + + int this_compacted_count = compacted_counts[this_old_ip]; + int target_compacted_count = compacted_counts[target_old_ip]; + + jump_count_compacted -= (target_compacted_count - this_compacted_count); + brw_inst_set_gfx4_jump_count(devinfo, insn, jump_count_compacted >> shift); +} + +static void +compaction_state_init(struct compaction_state *c, + const struct brw_isa_info *isa) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + assert(g45_control_index_table[ARRAY_SIZE(g45_control_index_table) - 1] != 0); + assert(g45_datatype_table[ARRAY_SIZE(g45_datatype_table) - 1] != 0); + assert(g45_subreg_table[ARRAY_SIZE(g45_subreg_table) - 1] != 0); + assert(g45_src_index_table[ARRAY_SIZE(g45_src_index_table) - 1] != 0); + assert(gfx6_control_index_table[ARRAY_SIZE(gfx6_control_index_table) - 1] != 0); + assert(gfx6_datatype_table[ARRAY_SIZE(gfx6_datatype_table) - 1] != 0); + assert(gfx6_subreg_table[ARRAY_SIZE(gfx6_subreg_table) - 1] != 0); + assert(gfx6_src_index_table[ARRAY_SIZE(gfx6_src_index_table) - 1] != 0); + assert(gfx7_control_index_table[ARRAY_SIZE(gfx7_control_index_table) - 1] != 0); + assert(gfx7_datatype_table[ARRAY_SIZE(gfx7_datatype_table) - 1] != 0); + assert(gfx7_subreg_table[ARRAY_SIZE(gfx7_subreg_table) - 1] != 0); + assert(gfx7_src_index_table[ARRAY_SIZE(gfx7_src_index_table) - 1] != 0); + assert(gfx8_control_index_table[ARRAY_SIZE(gfx8_control_index_table) - 1] != 0); + assert(gfx8_datatype_table[ARRAY_SIZE(gfx8_datatype_table) - 1] != 0); + assert(gfx8_subreg_table[ARRAY_SIZE(gfx8_subreg_table) - 1] != 0); + assert(gfx8_src_index_table[ARRAY_SIZE(gfx8_src_index_table) - 1] != 0); + assert(gfx11_datatype_table[ARRAY_SIZE(gfx11_datatype_table) - 1] != 0); + assert(gfx12_control_index_table[ARRAY_SIZE(gfx12_control_index_table) - 1] != 0); + 
assert(gfx12_datatype_table[ARRAY_SIZE(gfx12_datatype_table) - 1] != 0); + assert(gfx12_subreg_table[ARRAY_SIZE(gfx12_subreg_table) - 1] != 0); + assert(gfx12_src0_index_table[ARRAY_SIZE(gfx12_src0_index_table) - 1] != 0); + assert(gfx12_src1_index_table[ARRAY_SIZE(gfx12_src1_index_table) - 1] != 0); + assert(xehp_src0_index_table[ARRAY_SIZE(xehp_src0_index_table) - 1] != 0); + assert(xehp_src1_index_table[ARRAY_SIZE(xehp_src1_index_table) - 1] != 0); + assert(xe2_control_index_table[ARRAY_SIZE(xe2_control_index_table) - 1] != 0); + assert(xe2_datatype_table[ARRAY_SIZE(xe2_datatype_table) - 1] != 0); + assert(xe2_subreg_table[ARRAY_SIZE(xe2_subreg_table) - 1] != 0); + assert(xe2_src0_index_table[ARRAY_SIZE(xe2_src0_index_table) - 1] != 0); + assert(xe2_src1_index_table[ARRAY_SIZE(xe2_src1_index_table) - 1] != 0); + + c->isa = isa; + switch (devinfo->ver) { + case 20: + c->control_index_table = xe2_control_index_table; + c->datatype_table = xe2_datatype_table; + c->subreg_table = xe2_subreg_table; + c->src0_index_table = xe2_src0_index_table; + c->src1_index_table = xe2_src1_index_table; + break; + case 12: + c->control_index_table = gfx12_control_index_table;; + c->datatype_table = gfx12_datatype_table; + c->subreg_table = gfx12_subreg_table; + if (devinfo->verx10 >= 125) { + c->src0_index_table = xehp_src0_index_table; + c->src1_index_table = xehp_src1_index_table; + } else { + c->src0_index_table = gfx12_src0_index_table; + c->src1_index_table = gfx12_src1_index_table; + } + break; + case 11: + c->control_index_table = gfx8_control_index_table; + c->datatype_table = gfx11_datatype_table; + c->subreg_table = gfx8_subreg_table; + c->src0_index_table = gfx8_src_index_table; + c->src1_index_table = gfx8_src_index_table; + break; + case 9: + case 8: + c->control_index_table = gfx8_control_index_table; + c->datatype_table = gfx8_datatype_table; + c->subreg_table = gfx8_subreg_table; + c->src0_index_table = gfx8_src_index_table; + c->src1_index_table = 
gfx8_src_index_table; + break; + case 7: + c->control_index_table = gfx7_control_index_table; + c->datatype_table = gfx7_datatype_table; + c->subreg_table = gfx7_subreg_table; + c->src0_index_table = gfx7_src_index_table; + c->src1_index_table = gfx7_src_index_table; + break; + case 6: + c->control_index_table = gfx6_control_index_table; + c->datatype_table = gfx6_datatype_table; + c->subreg_table = gfx6_subreg_table; + c->src0_index_table = gfx6_src_index_table; + c->src1_index_table = gfx6_src_index_table; + break; + case 5: + case 4: + c->control_index_table = g45_control_index_table; + c->datatype_table = g45_datatype_table; + c->subreg_table = g45_subreg_table; + c->src0_index_table = g45_src_index_table; + c->src1_index_table = g45_src_index_table; + break; + default: + unreachable("unknown generation"); + } +} + +void +brw_compact_instructions(struct brw_codegen *p, int start_offset, + struct disasm_info *disasm) +{ + if (INTEL_DEBUG(DEBUG_NO_COMPACTION)) + return; + + const struct intel_device_info *devinfo = p->devinfo; + if (devinfo->ver == 4 && devinfo->platform != INTEL_PLATFORM_G4X) + return; + + void *store = p->store + start_offset / 16; + /* For an instruction at byte offset 16*i before compaction, this is the + * number of compacted instructions minus the number of padding NOP/NENOPs + * that preceded it. + */ + unsigned num_compacted_counts = + (p->next_insn_offset - start_offset) / sizeof(brw_inst); + int *compacted_counts = + calloc(1, sizeof(*compacted_counts) * num_compacted_counts); + + /* For an instruction at byte offset 8*i after compaction, this was its IP + * (in 16-byte units) before compaction. 
+ */ + unsigned num_old_ip = + (p->next_insn_offset - start_offset) / sizeof(brw_compact_inst) + 1; + int *old_ip = calloc(1, sizeof(*old_ip) * num_old_ip); + + struct compaction_state c; + compaction_state_init(&c, p->isa); + + int offset = 0; + int compacted_count = 0; + for (int src_offset = 0; src_offset < p->next_insn_offset - start_offset; + src_offset += sizeof(brw_inst)) { + brw_inst *src = store + src_offset; + void *dst = store + offset; + + old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst); + compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count; + + brw_inst inst = precompact(p->isa, *src); + brw_inst saved = inst; + + if (try_compact_instruction(&c, dst, &inst)) { + compacted_count++; + + if (INTEL_DEBUG(DEBUG_VS | DEBUG_GS | DEBUG_TCS | DEBUG_TASK | + DEBUG_WM | DEBUG_CS | DEBUG_TES | DEBUG_MESH | + DEBUG_RT)) { + brw_inst uncompacted; + uncompact_instruction(&c, &uncompacted, dst); + if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) { + brw_debug_compact_uncompact(p->isa, &saved, &uncompacted); + } + } + + offset += sizeof(brw_compact_inst); + } else { + /* All uncompacted instructions need to be aligned on G45. */ + if ((offset & sizeof(brw_compact_inst)) != 0 && + devinfo->platform == INTEL_PLATFORM_G4X) { + brw_compact_inst *align = store + offset; + memset(align, 0, sizeof(*align)); + brw_compact_inst_set_hw_opcode( + devinfo, align, brw_opcode_encode(p->isa, BRW_OPCODE_NENOP)); + brw_compact_inst_set_cmpt_control(devinfo, align, true); + offset += sizeof(brw_compact_inst); + compacted_count--; + compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count; + old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst); + + dst = store + offset; + } + + /* If we didn't compact this instruction, we need to move it down into + * place. 
+ */ + if (offset != src_offset) { + memmove(dst, src, sizeof(brw_inst)); + } + offset += sizeof(brw_inst); + } + } + + /* Add an entry for the ending offset of the program. This greatly + * simplifies the linked list walk at the end of the function. + */ + old_ip[offset / sizeof(brw_compact_inst)] = + (p->next_insn_offset - start_offset) / sizeof(brw_inst); + + /* Fix up control flow offsets. */ + p->next_insn_offset = start_offset + offset; + for (offset = 0; offset < p->next_insn_offset - start_offset; + offset = next_offset(devinfo, store, offset)) { + brw_inst *insn = store + offset; + int this_old_ip = old_ip[offset / sizeof(brw_compact_inst)]; + int this_compacted_count = compacted_counts[this_old_ip]; + + switch (brw_inst_opcode(p->isa, insn)) { + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + case BRW_OPCODE_HALT: + if (devinfo->ver >= 6) { + update_uip_jip(p->isa, insn, this_old_ip, compacted_counts); + } else { + update_gfx4_jump_count(devinfo, insn, this_old_ip, + compacted_counts); + } + break; + + case BRW_OPCODE_IF: + case BRW_OPCODE_IFF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + if (devinfo->ver >= 7) { + if (brw_inst_cmpt_control(devinfo, insn)) { + brw_inst uncompacted; + uncompact_instruction(&c, &uncompacted, + (brw_compact_inst *)insn); + + update_uip_jip(p->isa, &uncompacted, this_old_ip, + compacted_counts); + + bool ret = try_compact_instruction(&c, (brw_compact_inst *)insn, + &uncompacted); + assert(ret); (void)ret; + } else { + update_uip_jip(p->isa, insn, this_old_ip, compacted_counts); + } + } else if (devinfo->ver == 6) { + assert(!brw_inst_cmpt_control(devinfo, insn)); + + /* Jump Count is in units of compacted instructions on Gfx6. 
*/ + int jump_count_compacted = brw_inst_gfx6_jump_count(devinfo, insn); + + int target_old_ip = this_old_ip + (jump_count_compacted / 2); + int target_compacted_count = compacted_counts[target_old_ip]; + jump_count_compacted -= (target_compacted_count - this_compacted_count); + brw_inst_set_gfx6_jump_count(devinfo, insn, jump_count_compacted); + } else { + update_gfx4_jump_count(devinfo, insn, this_old_ip, + compacted_counts); + } + break; + + case BRW_OPCODE_ADD: + /* Add instructions modifying the IP register use an immediate src1, + * and Gens that use this cannot compact instructions with immediate + * operands. + */ + if (brw_inst_cmpt_control(devinfo, insn)) + break; + + if (brw_inst_dst_reg_file(devinfo, insn) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, insn) == BRW_ARF_IP) { + assert(brw_inst_src1_reg_file(devinfo, insn) == BRW_IMMEDIATE_VALUE); + + int shift = 3; + int jump_compacted = brw_inst_imm_d(devinfo, insn) >> shift; + + int target_old_ip = this_old_ip + (jump_compacted / 2); + int target_compacted_count = compacted_counts[target_old_ip]; + jump_compacted -= (target_compacted_count - this_compacted_count); + brw_inst_set_imm_ud(devinfo, insn, jump_compacted << shift); + } + break; + + default: + break; + } + } + + /* p->nr_insn is counting the number of uncompacted instructions still, so + * divide. We do want to be sure there's a valid instruction in any + * alignment padding, so that the next compression pass (for the FS 8/16 + * compile passes) parses correctly. 
+ */ + if (p->next_insn_offset & sizeof(brw_compact_inst)) { + brw_compact_inst *align = store + offset; + memset(align, 0, sizeof(*align)); + brw_compact_inst_set_hw_opcode( + devinfo, align, brw_opcode_encode(p->isa, BRW_OPCODE_NOP)); + brw_compact_inst_set_cmpt_control(devinfo, align, true); + p->next_insn_offset += sizeof(brw_compact_inst); + } + p->nr_insn = p->next_insn_offset / sizeof(brw_inst); + + for (int i = 0; i < p->num_relocs; i++) { + if (p->relocs[i].offset < (uint32_t)start_offset) + continue; + + assert(p->relocs[i].offset % 16 == 0); + unsigned idx = (p->relocs[i].offset - start_offset) / 16; + p->relocs[i].offset -= compacted_counts[idx] * 8; + } + + /* Update the instruction offsets for each group. */ + if (disasm) { + int offset = 0; + + foreach_list_typed(struct inst_group, group, link, &disasm->group_list) { + while (start_offset + old_ip[offset / sizeof(brw_compact_inst)] * + sizeof(brw_inst) != group->offset) { + assert(start_offset + old_ip[offset / sizeof(brw_compact_inst)] * + sizeof(brw_inst) < group->offset); + offset = next_offset(devinfo, store, offset); + } + + group->offset = start_offset + offset; + + offset = next_offset(devinfo, store, offset); + } + } + + free(compacted_counts); + free(old_ip); +} diff --git a/src/intel/compiler/elk/brw_eu_defines.h b/src/intel/compiler/elk/brw_eu_defines.h new file mode 100644 index 00000000000..0302334014d --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_defines.h @@ -0,0 +1,2218 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#ifndef BRW_EU_DEFINES_H +#define BRW_EU_DEFINES_H + +#include +#include +#include "util/macros.h" +#include "dev/intel_device_info.h" + +/* The following hunk, up-to "Execution Unit" is used by both the + * intel/compiler and i965 codebase. 
*/ + +#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low)) +/* Using the GNU statement expression extension */ +#define SET_FIELD(value, field) \ + ({ \ + uint32_t fieldval = (uint32_t)(value) << field ## _SHIFT; \ + assert((fieldval & ~ field ## _MASK) == 0); \ + fieldval & field ## _MASK; \ + }) + +#define SET_BITS(value, high, low) \ + ({ \ + const uint32_t fieldval = (uint32_t)(value) << (low); \ + assert((fieldval & ~INTEL_MASK(high, low)) == 0); \ + fieldval & INTEL_MASK(high, low); \ + }) + +#define GET_BITS(data, high, low) ((data & INTEL_MASK((high), (low))) >> (low)) +#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) + +/* Bitfields for the URB_WRITE message, DW2 of message header: */ +#define URB_WRITE_PRIM_END 0x1 +#define URB_WRITE_PRIM_START 0x2 +#define URB_WRITE_PRIM_TYPE_SHIFT 2 + +#define BRW_SPRITE_POINT_ENABLE 16 + +# define GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT 0 +# define GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID 1 + +/* Execution Unit (EU) defines + */ + +#define BRW_ALIGN_1 0 +#define BRW_ALIGN_16 1 + +#define BRW_ADDRESS_DIRECT 0 +#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER 1 + +#define BRW_CHANNEL_X 0 +#define BRW_CHANNEL_Y 1 +#define BRW_CHANNEL_Z 2 +#define BRW_CHANNEL_W 3 + +enum brw_compression { + BRW_COMPRESSION_NONE = 0, + BRW_COMPRESSION_2NDHALF = 1, + BRW_COMPRESSION_COMPRESSED = 2, +}; + +#define GFX6_COMPRESSION_1Q 0 +#define GFX6_COMPRESSION_2Q 1 +#define GFX6_COMPRESSION_3Q 2 +#define GFX6_COMPRESSION_4Q 3 +#define GFX6_COMPRESSION_1H 0 +#define GFX6_COMPRESSION_2H 2 + +enum ENUM_PACKED brw_conditional_mod { + BRW_CONDITIONAL_NONE = 0, + BRW_CONDITIONAL_Z = 1, + BRW_CONDITIONAL_NZ = 2, + BRW_CONDITIONAL_EQ = 1, /* Z */ + BRW_CONDITIONAL_NEQ = 2, /* NZ */ + BRW_CONDITIONAL_G = 3, + BRW_CONDITIONAL_GE = 4, + BRW_CONDITIONAL_L = 5, + BRW_CONDITIONAL_LE = 6, + BRW_CONDITIONAL_R = 7, /* Gen <= 5 */ + BRW_CONDITIONAL_O = 8, + BRW_CONDITIONAL_U = 9, +}; + +#define BRW_DEBUG_NONE 0 +#define 
BRW_DEBUG_BREAKPOINT 1 + +#define BRW_DEPENDENCY_NORMAL 0 +#define BRW_DEPENDENCY_NOTCLEARED 1 +#define BRW_DEPENDENCY_NOTCHECKED 2 +#define BRW_DEPENDENCY_DISABLE 3 + +enum ENUM_PACKED brw_execution_size { + BRW_EXECUTE_1 = 0, + BRW_EXECUTE_2 = 1, + BRW_EXECUTE_4 = 2, + BRW_EXECUTE_8 = 3, + BRW_EXECUTE_16 = 4, + BRW_EXECUTE_32 = 5, +}; + +enum ENUM_PACKED brw_horizontal_stride { + BRW_HORIZONTAL_STRIDE_0 = 0, + BRW_HORIZONTAL_STRIDE_1 = 1, + BRW_HORIZONTAL_STRIDE_2 = 2, + BRW_HORIZONTAL_STRIDE_4 = 3, +}; + +enum ENUM_PACKED gfx10_align1_3src_src_horizontal_stride { + BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0 = 0, + BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1 = 1, + BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2 = 2, + BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4 = 3, +}; + +enum ENUM_PACKED gfx10_align1_3src_dst_horizontal_stride { + BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1 = 0, + BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_2 = 1, +}; + +#define BRW_INSTRUCTION_NORMAL 0 +#define BRW_INSTRUCTION_SATURATE 1 + +#define BRW_MASK_ENABLE 0 +#define BRW_MASK_DISABLE 1 + +/** @{ + * + * Gfx6 has replaced "mask enable/disable" with WECtrl, which is + * effectively the same but much simpler to think about. Now, there + * are two contributors ANDed together to whether channels are + * executed: The predication on the instruction, and the channel write + * enable. + */ +/** + * This is the default value. It means that a channel's write enable is set + * if the per-channel IP is pointing at this instruction. + */ +#define BRW_WE_NORMAL 0 +/** + * This is used like BRW_MASK_DISABLE, and causes all channels to have + * their write enable set. Note that predication still contributes to + * whether the channel actually gets written. + */ +#define BRW_WE_ALL 1 +/** @} */ + +enum opcode { + /* These are the actual hardware instructions. 
*/ + BRW_OPCODE_ILLEGAL, + BRW_OPCODE_SYNC, + BRW_OPCODE_MOV, + BRW_OPCODE_SEL, + BRW_OPCODE_MOVI, /**< G45+ */ + BRW_OPCODE_NOT, + BRW_OPCODE_AND, + BRW_OPCODE_OR, + BRW_OPCODE_XOR, + BRW_OPCODE_SHR, + BRW_OPCODE_SHL, + BRW_OPCODE_DIM, /**< Gfx7.5 only */ + BRW_OPCODE_SMOV, /**< Gfx8+ */ + BRW_OPCODE_ASR, + BRW_OPCODE_ROR, /**< Gfx11+ */ + BRW_OPCODE_ROL, /**< Gfx11+ */ + BRW_OPCODE_CMP, + BRW_OPCODE_CMPN, + BRW_OPCODE_CSEL, /**< Gfx8+ */ + BRW_OPCODE_F32TO16, /**< Gfx7 only */ + BRW_OPCODE_F16TO32, /**< Gfx7 only */ + BRW_OPCODE_BFREV, /**< Gfx7+ */ + BRW_OPCODE_BFE, /**< Gfx7+ */ + BRW_OPCODE_BFI1, /**< Gfx7+ */ + BRW_OPCODE_BFI2, /**< Gfx7+ */ + BRW_OPCODE_JMPI, + BRW_OPCODE_BRD, /**< Gfx7+ */ + BRW_OPCODE_IF, + BRW_OPCODE_IFF, /**< Pre-Gfx6 */ + BRW_OPCODE_BRC, /**< Gfx7+ */ + BRW_OPCODE_ELSE, + BRW_OPCODE_ENDIF, + BRW_OPCODE_DO, /**< Pre-Gfx6 */ + BRW_OPCODE_CASE, /**< Gfx6 only */ + BRW_OPCODE_WHILE, + BRW_OPCODE_BREAK, + BRW_OPCODE_CONTINUE, + BRW_OPCODE_HALT, + BRW_OPCODE_CALLA, /**< Gfx7.5+ */ + BRW_OPCODE_MSAVE, /**< Pre-Gfx6 */ + BRW_OPCODE_CALL, /**< Gfx6+ */ + BRW_OPCODE_MREST, /**< Pre-Gfx6 */ + BRW_OPCODE_RET, /**< Gfx6+ */ + BRW_OPCODE_PUSH, /**< Pre-Gfx6 */ + BRW_OPCODE_FORK, /**< Gfx6 only */ + BRW_OPCODE_GOTO, /**< Gfx8+ */ + BRW_OPCODE_POP, /**< Pre-Gfx6 */ + BRW_OPCODE_WAIT, + BRW_OPCODE_SEND, + BRW_OPCODE_SENDC, + BRW_OPCODE_SENDS, /**< Gfx9+ */ + BRW_OPCODE_SENDSC, /**< Gfx9+ */ + BRW_OPCODE_MATH, /**< Gfx6+ */ + BRW_OPCODE_ADD, + BRW_OPCODE_MUL, + BRW_OPCODE_AVG, + BRW_OPCODE_FRC, + BRW_OPCODE_RNDU, + BRW_OPCODE_RNDD, + BRW_OPCODE_RNDE, + BRW_OPCODE_RNDZ, + BRW_OPCODE_MAC, + BRW_OPCODE_MACH, + BRW_OPCODE_LZD, + BRW_OPCODE_FBH, /**< Gfx7+ */ + BRW_OPCODE_FBL, /**< Gfx7+ */ + BRW_OPCODE_CBIT, /**< Gfx7+ */ + BRW_OPCODE_ADDC, /**< Gfx7+ */ + BRW_OPCODE_SUBB, /**< Gfx7+ */ + BRW_OPCODE_SAD2, + BRW_OPCODE_SADA2, + BRW_OPCODE_ADD3, /* Gen12+ only */ + BRW_OPCODE_DP4, + BRW_OPCODE_DPH, + BRW_OPCODE_DP3, + BRW_OPCODE_DP2, + BRW_OPCODE_DP4A, /**< 
Gfx12+ */ + BRW_OPCODE_LINE, + BRW_OPCODE_DPAS, /**< Gfx12.5+ */ + BRW_OPCODE_PLN, /**< G45+ */ + BRW_OPCODE_MAD, /**< Gfx6+ */ + BRW_OPCODE_LRP, /**< Gfx6+ */ + BRW_OPCODE_MADM, /**< Gfx8+ */ + BRW_OPCODE_NENOP, /**< G45 only */ + BRW_OPCODE_NOP, + + NUM_BRW_OPCODES, + + /* These are compiler backend opcodes that get translated into other + * instructions. + */ + FS_OPCODE_FB_WRITE = NUM_BRW_OPCODES, + + /** + * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as + * individual sources instead of as a single payload blob. The + * position/ordering of the arguments are defined by the enum + * fb_write_logical_srcs. + */ + FS_OPCODE_FB_WRITE_LOGICAL, + + FS_OPCODE_REP_FB_WRITE, + + FS_OPCODE_FB_READ, + FS_OPCODE_FB_READ_LOGICAL, + + SHADER_OPCODE_RCP, + SHADER_OPCODE_RSQ, + SHADER_OPCODE_SQRT, + SHADER_OPCODE_EXP2, + SHADER_OPCODE_LOG2, + SHADER_OPCODE_POW, + SHADER_OPCODE_INT_QUOTIENT, + SHADER_OPCODE_INT_REMAINDER, + SHADER_OPCODE_SIN, + SHADER_OPCODE_COS, + + /** + * A generic "send" opcode. The first two sources are the message + * descriptor and extended message descriptor respectively. The third + * and optional fourth sources are the message payload + */ + SHADER_OPCODE_SEND, + + /** + * An "undefined" write which does nothing but indicates to liveness that + * we don't care about any values in the register which predate this + * instruction. Used to prevent partial writes from causing issues with + * live ranges. + */ + SHADER_OPCODE_UNDEF, + + /** + * Texture sampling opcodes. + * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode but instead of taking a single payload blob they expect their + * arguments separately as individual sources. The position/ordering of the + * arguments are defined by the enum tex_logical_srcs. 
+ */ + SHADER_OPCODE_TEX, + SHADER_OPCODE_TEX_LOGICAL, + SHADER_OPCODE_TXD, + SHADER_OPCODE_TXD_LOGICAL, + SHADER_OPCODE_TXF, + SHADER_OPCODE_TXF_LOGICAL, + SHADER_OPCODE_TXF_LZ, + SHADER_OPCODE_TXL, + SHADER_OPCODE_TXL_LOGICAL, + SHADER_OPCODE_TXL_LZ, + SHADER_OPCODE_TXS, + SHADER_OPCODE_TXS_LOGICAL, + FS_OPCODE_TXB, + FS_OPCODE_TXB_LOGICAL, + SHADER_OPCODE_TXF_CMS, + SHADER_OPCODE_TXF_CMS_LOGICAL, + SHADER_OPCODE_TXF_CMS_W, + SHADER_OPCODE_TXF_CMS_W_LOGICAL, + SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL, + SHADER_OPCODE_TXF_UMS, + SHADER_OPCODE_TXF_UMS_LOGICAL, + SHADER_OPCODE_TXF_MCS, + SHADER_OPCODE_TXF_MCS_LOGICAL, + SHADER_OPCODE_LOD, + SHADER_OPCODE_LOD_LOGICAL, + SHADER_OPCODE_TG4, + SHADER_OPCODE_TG4_LOGICAL, + SHADER_OPCODE_TG4_OFFSET, + SHADER_OPCODE_TG4_OFFSET_LOGICAL, + SHADER_OPCODE_SAMPLEINFO, + SHADER_OPCODE_SAMPLEINFO_LOGICAL, + + SHADER_OPCODE_IMAGE_SIZE_LOGICAL, + + /** + * Combines multiple sources of size 1 into a larger virtual GRF. + * For example, parameters for a send-from-GRF message. Or, updating + * channels of a size 4 VGRF used to store vec4s such as texturing results. + * + * This will be lowered into MOVs from each source to consecutive offsets + * of the destination VGRF. + * + * src[0] may be BAD_FILE. If so, the lowering pass skips emitting the MOV, + * but still reserves the first channel of the destination VGRF. This can be + * used to reserve space for, say, a message header set up by the generators. + */ + SHADER_OPCODE_LOAD_PAYLOAD, + + /** + * Packs a number of sources into a single value. Unlike LOAD_PAYLOAD, this + * acts intra-channel, obtaining the final value for each channel by + * combining the sources values for the same channel, the first source + * occupying the lowest bits and the last source occupying the highest + * bits. + */ + FS_OPCODE_PACK, + + /** + * Typed and untyped surface access opcodes. 
+ * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode but instead of taking a single payload blob they expect their + * arguments separately as individual sources: + * + * Source 0: [required] Surface coordinates. + * Source 1: [optional] Operation source. + * Source 2: [required] Surface index. + * Source 3: [required] Number of coordinate components (as UD immediate). + * Source 4: [required] Opcode-specific control immediate, same as source 2 + * of the matching non-LOGICAL opcode. + */ + VEC4_OPCODE_UNTYPED_ATOMIC, + SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, + VEC4_OPCODE_UNTYPED_SURFACE_READ, + SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + VEC4_OPCODE_UNTYPED_SURFACE_WRITE, + SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + + SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL, + + /** + * Untyped A64 surface access opcodes. + * + * Source 0: 64-bit address + * Source 1: Operational source + * Source 2: [required] Opcode-specific control immediate, same as source 2 + * of the matching non-LOGICAL opcode. + */ + SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, + SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, + SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, + SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, + SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, + SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, + + SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, + SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, + SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, + + SHADER_OPCODE_RND_MODE, + SHADER_OPCODE_FLOAT_CONTROL_MODE, + + /** + * Byte scattered write/read opcodes. + * + * LOGICAL opcodes are eventually translated to the matching non-LOGICAL + * opcode, but instead of taking a single payload blog they expect their + * arguments separately as individual sources, like untyped write/read. 
+ */ + SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, + SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, + + /** + * Memory fence messages. + * + * Source 0: Must be register g0, used as header. + * Source 1: Immediate bool to indicate whether control is returned to the + * thread only after the fence has been honored. + * Source 2: Immediate byte indicating which memory to fence. Zero means + * global memory; GFX7_BTI_SLM means SLM (for Gfx11+ only). + * + * Vec4 backend only uses Source 0. + */ + SHADER_OPCODE_MEMORY_FENCE, + + /** + * Scheduling-only fence. + * + * Sources can be used to force a stall until the registers in those are + * available. This might generate MOVs or SYNC_NOPs (Gfx12+). + */ + FS_OPCODE_SCHEDULING_FENCE, + + SHADER_OPCODE_GFX4_SCRATCH_READ, + SHADER_OPCODE_GFX4_SCRATCH_WRITE, + SHADER_OPCODE_GFX7_SCRATCH_READ, + + SHADER_OPCODE_SCRATCH_HEADER, + + /** + * Gfx8+ SIMD8 URB messages. + */ + SHADER_OPCODE_URB_READ_LOGICAL, + SHADER_OPCODE_URB_WRITE_LOGICAL, + + /** + * Return the index of the first enabled live channel and assign it to + * to the first component of the destination. Frequently used as input + * for the BROADCAST pseudo-opcode. + */ + SHADER_OPCODE_FIND_LIVE_CHANNEL, + + /** + * Return the index of the last enabled live channel and assign it to + * the first component of the destination. + */ + SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, + + /** + * Return the current execution mask in the specified flag subregister. + * Can be CSE'ed more easily than a plain MOV from the ce0 ARF register. + */ + FS_OPCODE_LOAD_LIVE_CHANNELS, + + /** + * Pick the channel from its first source register given by the index + * specified as second source. Useful for variable indexing of surfaces. 
+ * + * Note that because the result of this instruction is by definition + * uniform and it can always be splatted to multiple channels using a + * scalar regioning mode, only the first channel of the destination region + * is guaranteed to be updated, which implies that BROADCAST instructions + * should usually be marked force_writemask_all. + */ + SHADER_OPCODE_BROADCAST, + + /* Pick the channel from its first source register given by the index + * specified as second source. + * + * This is similar to the BROADCAST instruction except that it takes a + * dynamic index and potentially puts a different value in each output + * channel. + */ + SHADER_OPCODE_SHUFFLE, + + /* Select between src0 and src1 based on channel enables. + * + * This instruction copies src0 into the enabled channels of the + * destination and copies src1 into the disabled channels. + */ + SHADER_OPCODE_SEL_EXEC, + + /* This turns into an align16 mov from src0 to dst with a swizzle + * provided as an immediate in src1. + */ + SHADER_OPCODE_QUAD_SWIZZLE, + + /* Take every Nth element in src0 and broadcast it to the group of N + * channels in which it lives in the destination. The offset within the + * cluster is given by src1 and the cluster size is given by src2. + */ + SHADER_OPCODE_CLUSTER_BROADCAST, + + SHADER_OPCODE_GET_BUFFER_SIZE, + + SHADER_OPCODE_INTERLOCK, + + /** Target for a HALT + * + * All HALT instructions in a shader must target the same jump point and + * that point is denoted by a HALT_TARGET instruction. 
+ */ + SHADER_OPCODE_HALT_TARGET, + + VEC4_OPCODE_MOV_BYTES, + VEC4_OPCODE_PACK_BYTES, + VEC4_OPCODE_UNPACK_UNIFORM, + VEC4_OPCODE_DOUBLE_TO_F32, + VEC4_OPCODE_DOUBLE_TO_D32, + VEC4_OPCODE_DOUBLE_TO_U32, + VEC4_OPCODE_TO_DOUBLE, + VEC4_OPCODE_PICK_LOW_32BIT, + VEC4_OPCODE_PICK_HIGH_32BIT, + VEC4_OPCODE_SET_LOW_32BIT, + VEC4_OPCODE_SET_HIGH_32BIT, + VEC4_OPCODE_MOV_FOR_SCRATCH, + VEC4_OPCODE_ZERO_OOB_PUSH_REGS, + + FS_OPCODE_DDX_COARSE, + FS_OPCODE_DDX_FINE, + /** + * Compute dFdy(), dFdyCoarse(), or dFdyFine(). + */ + FS_OPCODE_DDY_COARSE, + FS_OPCODE_DDY_FINE, + FS_OPCODE_LINTERP, + FS_OPCODE_PIXEL_X, + FS_OPCODE_PIXEL_Y, + FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4, + FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, + FS_OPCODE_SET_SAMPLE_ID, + FS_OPCODE_PACK_HALF_2x16_SPLIT, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, + + VEC4_VS_OPCODE_URB_WRITE, + VS_OPCODE_PULL_CONSTANT_LOAD, + VS_OPCODE_PULL_CONSTANT_LOAD_GFX7, + + VS_OPCODE_UNPACK_FLAGS_SIMD4X2, + + /** + * Write geometry shader output data to the URB. + * + * Unlike VEC4_VS_OPCODE_URB_WRITE, this opcode doesn't do an implied move from + * R0 to the first MRF. This allows the geometry shader to override the + * "Slot {0,1} Offset" fields in the message header. + */ + VEC4_GS_OPCODE_URB_WRITE, + + /** + * Write geometry shader output data to the URB and request a new URB + * handle (gfx6). + * + * This opcode doesn't do an implied move from R0 to the first MRF. + */ + VEC4_GS_OPCODE_URB_WRITE_ALLOCATE, + + /** + * Terminate the geometry shader thread by doing an empty URB write. + * + * This opcode doesn't do an implied move from R0 to the first MRF. This + * allows the geometry shader to override the "GS Number of Output Vertices + * for Slot {0,1}" fields in the message header. + */ + GS_OPCODE_THREAD_END, + + /** + * Set the "Slot {0,1} Offset" fields of a URB_WRITE message header. 
+ * + * - dst is the MRF containing the message header. + * + * - src0.x indicates which portion of the URB should be written to (e.g. a + * vertex number) + * + * - src1 is an immediate multiplier which will be applied to src0 + * (e.g. the size of a single vertex in the URB). + * + * Note: the hardware will apply this offset *in addition to* the offset in + * vec4_instruction::offset. + */ + GS_OPCODE_SET_WRITE_OFFSET, + + /** + * Set the "GS Number of Output Vertices for Slot {0,1}" fields of a + * URB_WRITE message header. + * + * - dst is the MRF containing the message header. + * + * - src0.x is the vertex count. The upper 16 bits will be ignored. + */ + GS_OPCODE_SET_VERTEX_COUNT, + + /** + * Set DWORD 2 of dst to the value in src. + */ + GS_OPCODE_SET_DWORD_2, + + /** + * Prepare the dst register for storage in the "Channel Mask" fields of a + * URB_WRITE message header. + * + * DWORD 4 of dst is shifted left by 4 bits, so that later, + * GS_OPCODE_SET_CHANNEL_MASKS can OR DWORDs 0 and 4 together to form the + * final channel mask. + * + * Note: since GS_OPCODE_SET_CHANNEL_MASKS ORs DWORDs 0 and 4 together to + * form the final channel mask, DWORDs 0 and 4 of the dst register must not + * have any extraneous bits set prior to execution of this opcode (that is, + * they should be in the range 0x0 to 0xf). + */ + GS_OPCODE_PREPARE_CHANNEL_MASKS, + + /** + * Set the "Channel Mask" fields of a URB_WRITE message header. + * + * - dst is the MRF containing the message header. + * + * - src.x is the channel mask, as prepared by + * GS_OPCODE_PREPARE_CHANNEL_MASKS. DWORDs 0 and 4 are OR'ed together to + * form the final channel mask. + */ + GS_OPCODE_SET_CHANNEL_MASKS, + + /** + * Get the "Instance ID" fields from the payload. + * + * - dst is the GRF for gl_InvocationID. + */ + GS_OPCODE_GET_INSTANCE_ID, + + /** + * Send a FF_SYNC message to allocate initial URB handles (gfx6). + * + * - dst will be used as the writeback register for the FF_SYNC operation. 
+ * + * - src0 is the number of primitives written. + * + * - src1 is the value to hold in M0.0: number of SO vertices to write + * and number of SO primitives needed. Its value will be overwritten + * with the SVBI values if transform feedback is enabled. + * + * Note: This opcode uses an implicit MRF register for the ff_sync message + * header, so the caller is expected to set inst->base_mrf and initialize + * that MRF register to r0. This opcode will also write to this MRF register + * to include the allocated URB handle so it can then be reused directly as + * the header in the URB write operation we are allocating the handle for. + */ + GS_OPCODE_FF_SYNC, + + /** + * Move r0.1 (which holds PrimitiveID information in gfx6) to a separate + * register. + * + * - dst is the GRF where PrimitiveID information will be moved. + */ + GS_OPCODE_SET_PRIMITIVE_ID, + + /** + * Write transform feedback data to the SVB by sending a SVB WRITE message. + * Used in gfx6. + * + * - dst is the MRF register containing the message header. + * + * - src0 is the register where the vertex data is going to be copied from. + * + * - src1 is the destination register when write commit occurs. + */ + GS_OPCODE_SVB_WRITE, + + /** + * Set destination index in the SVB write message payload (M0.5). Used + * in gfx6 for transform feedback. + * + * - dst is the header to save the destination indices for SVB WRITE. + * - src is the register that holds the destination indices value. + */ + GS_OPCODE_SVB_SET_DST_INDEX, + + /** + * Prepare Mx.0 subregister for being used in the FF_SYNC message header. + * Used in gfx6 for transform feedback. + * + * - dst will hold the register with the final Mx.0 value. + * + * - src0 has the number of vertices emitted in SO (NumSOVertsToWrite) + * + * - src1 has the number of needed primitives for SO (NumSOPrimsNeeded) + * + * - src2 is the value to hold in M0: number of SO vertices to write + * and number of SO primitives needed. 
+ */ + GS_OPCODE_FF_SYNC_SET_PRIMITIVES, + + /** + * Terminate the compute shader. + */ + CS_OPCODE_CS_TERMINATE, + + /** + * GLSL barrier() + */ + SHADER_OPCODE_BARRIER, + + /** + * Calculate the high 32-bits of a 32x32 multiply. + */ + SHADER_OPCODE_MULH, + + /** Signed subtraction with saturation. */ + SHADER_OPCODE_ISUB_SAT, + + /** Unsigned subtraction with saturation. */ + SHADER_OPCODE_USUB_SAT, + + /** + * A MOV that uses VxH indirect addressing. + * + * Source 0: A register to start from (HW_REG). + * Source 1: An indirect offset (in bytes, UD GRF). + * Source 2: The length of the region that could be accessed (in bytes, + * UD immediate). + */ + SHADER_OPCODE_MOV_INDIRECT, + + /** Fills out a relocatable immediate */ + SHADER_OPCODE_MOV_RELOC_IMM, + + VEC4_OPCODE_URB_READ, + TCS_OPCODE_GET_INSTANCE_ID, + VEC4_TCS_OPCODE_URB_WRITE, + VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS, + VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, + TCS_OPCODE_GET_PRIMITIVE_ID, + TCS_OPCODE_CREATE_BARRIER_HEADER, + TCS_OPCODE_SRC0_010_IS_ZERO, + TCS_OPCODE_RELEASE_INPUT, + TCS_OPCODE_THREAD_END, + + TES_OPCODE_GET_PRIMITIVE_ID, + TES_OPCODE_CREATE_INPUT_READ_HEADER, + TES_OPCODE_ADD_INDIRECT_URB_OFFSET, + + SHADER_OPCODE_BTD_SPAWN_LOGICAL, + SHADER_OPCODE_BTD_RETIRE_LOGICAL, + + SHADER_OPCODE_READ_SR_REG, + + RT_OPCODE_TRACE_RAY_LOGICAL, +}; + +enum brw_urb_write_flags { + BRW_URB_WRITE_NO_FLAGS = 0, + + /** + * Causes a new URB entry to be allocated, and its address stored in the + * destination register (gen < 7). + */ + BRW_URB_WRITE_ALLOCATE = 0x1, + + /** + * Causes the current URB entry to be deallocated (gen < 7). + */ + BRW_URB_WRITE_UNUSED = 0x2, + + /** + * Causes the thread to terminate. + */ + BRW_URB_WRITE_EOT = 0x4, + + /** + * Indicates that the given URB entry is complete, and may be sent further + * down the 3D pipeline (gen < 7). 
+ */ + BRW_URB_WRITE_COMPLETE = 0x8, + + /** + * Indicates that an additional offset (which may be different for the two + * vec4 slots) is stored in the message header (gen == 7). + */ + BRW_URB_WRITE_PER_SLOT_OFFSET = 0x10, + + /** + * Indicates that the channel masks in the URB_WRITE message header should + * not be overridden to 0xff (gen == 7). + */ + BRW_URB_WRITE_USE_CHANNEL_MASKS = 0x20, + + /** + * Indicates that the data should be sent to the URB using the + * URB_WRITE_OWORD message rather than URB_WRITE_HWORD (gen == 7). This + * causes offsets to be interpreted as multiples of an OWORD instead of an + * HWORD, and only allows one OWORD to be written. + */ + BRW_URB_WRITE_OWORD = 0x40, + + /** + * Convenient combination of flags: end the thread while simultaneously + * marking the given URB entry as complete. + */ + BRW_URB_WRITE_EOT_COMPLETE = BRW_URB_WRITE_EOT | BRW_URB_WRITE_COMPLETE, + + /** + * Convenient combination of flags: mark the given URB entry as complete + * and simultaneously allocate a new one. 
+ */ + BRW_URB_WRITE_ALLOCATE_COMPLETE = + BRW_URB_WRITE_ALLOCATE | BRW_URB_WRITE_COMPLETE, +}; + +enum fb_write_logical_srcs { + FB_WRITE_LOGICAL_SRC_COLOR0, /* REQUIRED */ + FB_WRITE_LOGICAL_SRC_COLOR1, /* for dual source blend messages */ + FB_WRITE_LOGICAL_SRC_SRC0_ALPHA, + FB_WRITE_LOGICAL_SRC_SRC_DEPTH, /* gl_FragDepth */ + FB_WRITE_LOGICAL_SRC_DST_DEPTH, /* GFX4-5: passthrough from thread */ + FB_WRITE_LOGICAL_SRC_SRC_STENCIL, /* gl_FragStencilRefARB */ + FB_WRITE_LOGICAL_SRC_OMASK, /* Sample Mask (gl_SampleMask) */ + FB_WRITE_LOGICAL_SRC_COMPONENTS, /* REQUIRED */ + FB_WRITE_LOGICAL_NUM_SRCS +}; + +enum tex_logical_srcs { + /** Texture coordinates */ + TEX_LOGICAL_SRC_COORDINATE, + /** Shadow comparator */ + TEX_LOGICAL_SRC_SHADOW_C, + /** dPdx if the operation takes explicit derivatives, otherwise LOD value */ + TEX_LOGICAL_SRC_LOD, + /** dPdy if the operation takes explicit derivatives */ + TEX_LOGICAL_SRC_LOD2, + /** Min LOD */ + TEX_LOGICAL_SRC_MIN_LOD, + /** Sample index */ + TEX_LOGICAL_SRC_SAMPLE_INDEX, + /** MCS data */ + TEX_LOGICAL_SRC_MCS, + /** REQUIRED: Texture surface index */ + TEX_LOGICAL_SRC_SURFACE, + /** Texture sampler index */ + TEX_LOGICAL_SRC_SAMPLER, + /** Texture surface bindless handle */ + TEX_LOGICAL_SRC_SURFACE_HANDLE, + /** Texture sampler bindless handle */ + TEX_LOGICAL_SRC_SAMPLER_HANDLE, + /** Texel offset for gathers */ + TEX_LOGICAL_SRC_TG4_OFFSET, + /** REQUIRED: Number of coordinate components (as UD immediate) */ + TEX_LOGICAL_SRC_COORD_COMPONENTS, + /** REQUIRED: Number of derivative components (as UD immediate) */ + TEX_LOGICAL_SRC_GRAD_COMPONENTS, + /** REQUIRED: request residency (as UD immediate) */ + TEX_LOGICAL_SRC_RESIDENCY, + + TEX_LOGICAL_NUM_SRCS, +}; + +enum pull_uniform_constant_srcs { + /** Surface binding table index */ + PULL_UNIFORM_CONSTANT_SRC_SURFACE, + /** Surface bindless handle */ + PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE, + /** Surface offset */ + PULL_UNIFORM_CONSTANT_SRC_OFFSET, + /** Pull 
size */ + PULL_UNIFORM_CONSTANT_SRC_SIZE, + + PULL_UNIFORM_CONSTANT_SRCS, +}; + +enum pull_varying_constant_srcs { + /** Surface binding table index */ + PULL_VARYING_CONSTANT_SRC_SURFACE, + /** Surface bindless handle */ + PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE, + /** Surface offset */ + PULL_VARYING_CONSTANT_SRC_OFFSET, + /** Pull alignment */ + PULL_VARYING_CONSTANT_SRC_ALIGNMENT, + + PULL_VARYING_CONSTANT_SRCS, +}; + +enum get_buffer_size_srcs { + /** Surface binding table index */ + GET_BUFFER_SIZE_SRC_SURFACE, + /** Surface bindless handle */ + GET_BUFFER_SIZE_SRC_SURFACE_HANDLE, + /** LOD */ + GET_BUFFER_SIZE_SRC_LOD, + + GET_BUFFER_SIZE_SRCS +}; + +enum surface_logical_srcs { + /** Surface binding table index */ + SURFACE_LOGICAL_SRC_SURFACE, + /** Surface bindless handle */ + SURFACE_LOGICAL_SRC_SURFACE_HANDLE, + /** Surface address; could be multi-dimensional for typed opcodes */ + SURFACE_LOGICAL_SRC_ADDRESS, + /** Data to be written or used in an atomic op */ + SURFACE_LOGICAL_SRC_DATA, + /** Surface number of dimensions. Affects the size of ADDRESS */ + SURFACE_LOGICAL_SRC_IMM_DIMS, + /** Per-opcode immediate argument. For atomics, this is the atomic opcode */ + SURFACE_LOGICAL_SRC_IMM_ARG, + /** + * Some instructions with side-effects should not be predicated on + * sample mask, e.g. lowered stores to scratch. + */ + SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK, + + SURFACE_LOGICAL_NUM_SRCS +}; + +enum a64_logical_srcs { + /** Address the A64 message operates on */ + A64_LOGICAL_ADDRESS, + /** Source for the operation (unused of LOAD ops) */ + A64_LOGICAL_SRC, + /** Per-opcode immediate argument. Number of dwords, bit size, or atomic op. */ + A64_LOGICAL_ARG, + /** + * Some instructions do want to run on helper lanes (like ray queries). 
+ */ + A64_LOGICAL_ENABLE_HELPERS, + + A64_LOGICAL_NUM_SRCS +}; + +enum rt_logical_srcs { + /** Address of the globals */ + RT_LOGICAL_SRC_GLOBALS, + /** Level at which the tracing should start */ + RT_LOGICAL_SRC_BVH_LEVEL, + /** Type of tracing operation */ + RT_LOGICAL_SRC_TRACE_RAY_CONTROL, + /** Synchronous tracing (ray query) */ + RT_LOGICAL_SRC_SYNCHRONOUS, + + RT_LOGICAL_NUM_SRCS +}; + +enum urb_logical_srcs { + URB_LOGICAL_SRC_HANDLE, + URB_LOGICAL_SRC_PER_SLOT_OFFSETS, + URB_LOGICAL_SRC_CHANNEL_MASK, + /** Data to be written. BAD_FILE for reads. */ + URB_LOGICAL_SRC_DATA, + URB_LOGICAL_SRC_COMPONENTS, + URB_LOGICAL_NUM_SRCS +}; + +enum interpolator_logical_srcs { + /** Interpolation offset */ + INTERP_SRC_OFFSET, + /** Message data */ + INTERP_SRC_MSG_DESC, + /** Flag register for dynamic mode */ + INTERP_SRC_DYNAMIC_MODE, + + INTERP_NUM_SRCS +}; + + +#ifdef __cplusplus +/** + * Allow brw_urb_write_flags enums to be ORed together. + */ +inline brw_urb_write_flags +operator|(brw_urb_write_flags x, brw_urb_write_flags y) +{ + return static_cast(static_cast(x) | + static_cast(y)); +} +#endif + +enum ENUM_PACKED brw_predicate { + BRW_PREDICATE_NONE = 0, + BRW_PREDICATE_NORMAL = 1, + BRW_PREDICATE_ALIGN1_ANYV = 2, + BRW_PREDICATE_ALIGN1_ALLV = 3, + BRW_PREDICATE_ALIGN1_ANY2H = 4, + BRW_PREDICATE_ALIGN1_ALL2H = 5, + BRW_PREDICATE_ALIGN1_ANY4H = 6, + BRW_PREDICATE_ALIGN1_ALL4H = 7, + BRW_PREDICATE_ALIGN1_ANY8H = 8, + BRW_PREDICATE_ALIGN1_ALL8H = 9, + BRW_PREDICATE_ALIGN1_ANY16H = 10, + BRW_PREDICATE_ALIGN1_ALL16H = 11, + BRW_PREDICATE_ALIGN1_ANY32H = 12, + BRW_PREDICATE_ALIGN1_ALL32H = 13, + BRW_PREDICATE_ALIGN16_REPLICATE_X = 2, + BRW_PREDICATE_ALIGN16_REPLICATE_Y = 3, + BRW_PREDICATE_ALIGN16_REPLICATE_Z = 4, + BRW_PREDICATE_ALIGN16_REPLICATE_W = 5, + BRW_PREDICATE_ALIGN16_ANY4H = 6, + BRW_PREDICATE_ALIGN16_ALL4H = 7, + XE2_PREDICATE_ANY = 2, + XE2_PREDICATE_ALL = 3 +}; + +enum ENUM_PACKED brw_reg_file { + BRW_ARCHITECTURE_REGISTER_FILE = 0, + 
BRW_GENERAL_REGISTER_FILE = 1, + BRW_MESSAGE_REGISTER_FILE = 2, + BRW_IMMEDIATE_VALUE = 3, + + ARF = BRW_ARCHITECTURE_REGISTER_FILE, + FIXED_GRF = BRW_GENERAL_REGISTER_FILE, + MRF = BRW_MESSAGE_REGISTER_FILE, + IMM = BRW_IMMEDIATE_VALUE, + + /* These are not hardware values */ + VGRF, + ATTR, + UNIFORM, /* prog_data->params[reg] */ + BAD_FILE, +}; + +enum ENUM_PACKED gfx10_align1_3src_reg_file { + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE = 0, + BRW_ALIGN1_3SRC_IMMEDIATE_VALUE = 1, /* src0, src2 */ + BRW_ALIGN1_3SRC_ACCUMULATOR = 1, /* dest, src1 */ +}; + +/* CNL adds Align1 support for 3-src instructions. Bit 35 of the instruction + * word is "Execution Datatype" which controls whether the instruction operates + * on float or integer types. The register arguments have fields that offer + * more fine control their respective types. + */ +enum ENUM_PACKED gfx10_align1_3src_exec_type { + BRW_ALIGN1_3SRC_EXEC_TYPE_INT = 0, + BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT = 1, +}; + +#define BRW_ARF_NULL 0x00 +#define BRW_ARF_ADDRESS 0x10 +#define BRW_ARF_ACCUMULATOR 0x20 +#define BRW_ARF_FLAG 0x30 +#define BRW_ARF_MASK 0x40 +#define BRW_ARF_MASK_STACK 0x50 +#define BRW_ARF_MASK_STACK_DEPTH 0x60 +#define BRW_ARF_STATE 0x70 +#define BRW_ARF_CONTROL 0x80 +#define BRW_ARF_NOTIFICATION_COUNT 0x90 +#define BRW_ARF_IP 0xA0 +#define BRW_ARF_TDR 0xB0 +#define BRW_ARF_TIMESTAMP 0xC0 + +#define BRW_MRF_COMPR4 (1 << 7) + +#define BRW_AMASK 0 +#define BRW_IMASK 1 +#define BRW_LMASK 2 +#define BRW_CMASK 3 + + + +#define BRW_THREAD_NORMAL 0 +#define BRW_THREAD_ATOMIC 1 +#define BRW_THREAD_SWITCH 2 + +enum ENUM_PACKED brw_vertical_stride { + BRW_VERTICAL_STRIDE_0 = 0, + BRW_VERTICAL_STRIDE_1 = 1, + BRW_VERTICAL_STRIDE_2 = 2, + BRW_VERTICAL_STRIDE_4 = 3, + BRW_VERTICAL_STRIDE_8 = 4, + BRW_VERTICAL_STRIDE_16 = 5, + BRW_VERTICAL_STRIDE_32 = 6, + BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL = 0xF, +}; + +enum ENUM_PACKED gfx10_align1_3src_vertical_stride { + BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0 = 0, + 
BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1 = 1, + BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2 = 1, + BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4 = 2, + BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8 = 3, +}; + +enum ENUM_PACKED brw_width { + BRW_WIDTH_1 = 0, + BRW_WIDTH_2 = 1, + BRW_WIDTH_4 = 2, + BRW_WIDTH_8 = 3, + BRW_WIDTH_16 = 4, +}; + +/** + * Gfx12+ SWSB SBID synchronization mode. + * + * This is represented as a bitmask including any required SBID token + * synchronization modes, used to synchronize out-of-order instructions. Only + * the strongest mode of the mask will be provided to the hardware in the SWSB + * field of an actual hardware instruction, but virtual instructions may be + * able to take into account multiple of them. + */ +enum tgl_sbid_mode { + TGL_SBID_NULL = 0, + TGL_SBID_SRC = 1, + TGL_SBID_DST = 2, + TGL_SBID_SET = 4 +}; + + +enum gfx12_sub_byte_precision { + BRW_SUB_BYTE_PRECISION_NONE = 0, + + /** 4 bits. Signedness determined by base type */ + BRW_SUB_BYTE_PRECISION_4BIT = 1, + + /** 2 bits. Signedness determined by base type */ + BRW_SUB_BYTE_PRECISION_2BIT = 2, +}; + +enum gfx12_systolic_depth { + BRW_SYSTOLIC_DEPTH_16 = 0, + BRW_SYSTOLIC_DEPTH_2 = 1, + BRW_SYSTOLIC_DEPTH_4 = 2, + BRW_SYSTOLIC_DEPTH_8 = 3, +}; + +#ifdef __cplusplus +/** + * Allow bitwise arithmetic of tgl_sbid_mode enums. + */ +inline tgl_sbid_mode +operator|(tgl_sbid_mode x, tgl_sbid_mode y) +{ + return tgl_sbid_mode(unsigned(x) | unsigned(y)); +} + +inline tgl_sbid_mode +operator&(tgl_sbid_mode x, tgl_sbid_mode y) +{ + return tgl_sbid_mode(unsigned(x) & unsigned(y)); +} + +inline tgl_sbid_mode & +operator|=(tgl_sbid_mode &x, tgl_sbid_mode y) +{ + return x = x | y; +} + +#endif + +/** + * TGL+ SWSB RegDist synchronization pipeline. + * + * On TGL all instructions that use the RegDist synchronization mechanism are + * considered to be executed as a single in-order pipeline, therefore only the + * TGL_PIPE_FLOAT pipeline is applicable. 
On XeHP+ platforms there are two + * additional asynchronous ALU pipelines (which still execute instructions + * in-order and use the RegDist synchronization mechanism). TGL_PIPE_NONE + * doesn't provide any RegDist pipeline synchronization information and allows + * the hardware to infer the pipeline based on the source types of the + * instruction. TGL_PIPE_ALL can be used when synchronization with all ALU + * pipelines is intended. + */ +enum tgl_pipe { + TGL_PIPE_NONE = 0, + TGL_PIPE_FLOAT, + TGL_PIPE_INT, + TGL_PIPE_LONG, + TGL_PIPE_MATH, + TGL_PIPE_ALL +}; + +/** + * Logical representation of the SWSB scheduling information of a hardware + * instruction. The binary representation is slightly more compact. + */ +struct tgl_swsb { + unsigned regdist : 3; + enum tgl_pipe pipe : 3; + unsigned sbid : 5; + enum tgl_sbid_mode mode : 3; +}; + +/** + * Construct a scheduling annotation with a single RegDist dependency. This + * synchronizes with the completion of the d-th previous in-order instruction. + * The index is one-based, zero causes a no-op tgl_swsb to be constructed. + */ +static inline struct tgl_swsb +tgl_swsb_regdist(unsigned d) +{ + const struct tgl_swsb swsb = { d, d ? TGL_PIPE_ALL : TGL_PIPE_NONE }; + assert(swsb.regdist == d); + return swsb; +} + +/** + * Construct a scheduling annotation that synchronizes with the specified SBID + * token. + */ +static inline struct tgl_swsb +tgl_swsb_sbid(enum tgl_sbid_mode mode, unsigned sbid) +{ + const struct tgl_swsb swsb = { 0, TGL_PIPE_NONE, sbid, mode }; + assert(swsb.sbid == sbid); + return swsb; +} + +/** + * Construct a no-op scheduling annotation. + */ +static inline struct tgl_swsb +tgl_swsb_null(void) +{ + return tgl_swsb_regdist(0); +} + +/** + * Return a scheduling annotation that allocates the same SBID synchronization + * token as \p swsb. In addition it will synchronize against a previous + * in-order instruction if \p regdist is non-zero. 
+ */ +static inline struct tgl_swsb +tgl_swsb_dst_dep(struct tgl_swsb swsb, unsigned regdist) +{ + swsb.regdist = regdist; + swsb.mode = swsb.mode & TGL_SBID_SET; + swsb.pipe = (regdist ? TGL_PIPE_ALL : TGL_PIPE_NONE); + return swsb; +} + +/** + * Return a scheduling annotation that synchronizes against the same SBID and + * RegDist dependencies as \p swsb, but doesn't allocate any SBID token. + */ +static inline struct tgl_swsb +tgl_swsb_src_dep(struct tgl_swsb swsb) +{ + swsb.mode = swsb.mode & (TGL_SBID_SRC | TGL_SBID_DST); + return swsb; +} + +/** + * Convert the provided tgl_swsb to the hardware's binary representation of an + * SWSB annotation. + */ +static inline uint32_t +tgl_swsb_encode(const struct intel_device_info *devinfo, struct tgl_swsb swsb) +{ + if (!swsb.mode) { + const unsigned pipe = devinfo->verx10 < 125 ? 0 : + swsb.pipe == TGL_PIPE_FLOAT ? 0x10 : + swsb.pipe == TGL_PIPE_INT ? 0x18 : + swsb.pipe == TGL_PIPE_LONG ? 0x20 : + swsb.pipe == TGL_PIPE_MATH ? 0x28 : + swsb.pipe == TGL_PIPE_ALL ? 0x8 : 0; + return pipe | swsb.regdist; + + } else if (swsb.regdist) { + if (devinfo->ver >= 20) { + if ((swsb.mode & TGL_SBID_SET)) { + assert(swsb.pipe == TGL_PIPE_ALL || + swsb.pipe == TGL_PIPE_INT || swsb.pipe == TGL_PIPE_FLOAT); + return (swsb.pipe == TGL_PIPE_INT ? 0x300 : + swsb.pipe == TGL_PIPE_FLOAT ? 0x200 : 0x100) | + swsb.regdist << 5 | swsb.sbid; + } else { + assert(!(swsb.mode & ~(TGL_SBID_DST | TGL_SBID_SRC))); + return (swsb.pipe == TGL_PIPE_ALL ? 0x300 : + swsb.mode == TGL_SBID_SRC ? 0x200 : 0x100) | + swsb.regdist << 5 | swsb.sbid; + } + } else { + assert(!(swsb.sbid & ~0xfu)); + return 0x80 | swsb.regdist << 4 | swsb.sbid; + } + + } else { + if (devinfo->ver >= 20) { + return swsb.sbid | (swsb.mode & TGL_SBID_SET ? 0xc0 : + swsb.mode & TGL_SBID_DST ? 0x80 : 0xa0); + } else { + assert(!(swsb.sbid & ~0xfu)); + return swsb.sbid | (swsb.mode & TGL_SBID_SET ? 0x40 : + swsb.mode & TGL_SBID_DST ? 
0x20 : 0x30);
      }
   }
}

/**
 * Convert the provided binary representation of an SWSB annotation to a
 * tgl_swsb.  Inverse of tgl_swsb_encode(); \p is_unordered disambiguates
 * encodings that are shared between token allocation and destination waits.
 */
static inline struct tgl_swsb
tgl_swsb_decode(const struct intel_device_info *devinfo,
                const bool is_unordered, const uint32_t x)
{
   if (devinfo->ver >= 20) {
      /* Xe2+ layout. */
      if (x & 0x300) {
         /* Combined RegDist + SBID encoding. */
         if (is_unordered) {
            const struct tgl_swsb swsb = {
               (x & 0xe0u) >> 5,
               ((x & 0x300) == 0x300 ? TGL_PIPE_INT :
                (x & 0x300) == 0x200 ? TGL_PIPE_FLOAT :
                TGL_PIPE_ALL),
               x & 0x1fu,
               TGL_SBID_SET
            };
            return swsb;
         } else {
            const struct tgl_swsb swsb = {
               (x & 0xe0u) >> 5,
               ((x & 0x300) == 0x300 ? TGL_PIPE_ALL : TGL_PIPE_NONE),
               x & 0x1fu,
               ((x & 0x300) == 0x200 ? TGL_SBID_SRC : TGL_SBID_DST)
            };
            return swsb;
         }

      } else if ((x & 0xe0) == 0x80) {
         return tgl_swsb_sbid(TGL_SBID_DST, x & 0x1f);
      } else if ((x & 0xe0) == 0xa0) {
         return tgl_swsb_sbid(TGL_SBID_SRC, x & 0x1fu);
      } else if ((x & 0xe0) == 0xc0) {
         return tgl_swsb_sbid(TGL_SBID_SET, x & 0x1fu);
      } else {
         /* RegDist-only encoding with a pipe selector. */
         const struct tgl_swsb swsb = { x & 0x7u,
                                        ((x & 0x38) == 0x10 ? TGL_PIPE_FLOAT :
                                         (x & 0x38) == 0x18 ? TGL_PIPE_INT :
                                         (x & 0x38) == 0x20 ? TGL_PIPE_LONG :
                                         (x & 0x38) == 0x28 ? TGL_PIPE_MATH :
                                         (x & 0x38) == 0x8 ? TGL_PIPE_ALL :
                                         TGL_PIPE_NONE) };
         return swsb;
      }

   } else {
      /* Gfx12.0/12.5 layout. */
      if (x & 0x80) {
         /* Combined RegDist + SBID encoding. */
         const struct tgl_swsb swsb = { (x & 0x70u) >> 4, TGL_PIPE_NONE,
                                        x & 0xfu,
                                        is_unordered ?
                                        TGL_SBID_SET : TGL_SBID_DST };
         return swsb;
      } else if ((x & 0x70) == 0x20) {
         return tgl_swsb_sbid(TGL_SBID_DST, x & 0xfu);
      } else if ((x & 0x70) == 0x30) {
         return tgl_swsb_sbid(TGL_SBID_SRC, x & 0xfu);
      } else if ((x & 0x70) == 0x40) {
         return tgl_swsb_sbid(TGL_SBID_SET, x & 0xfu);
      } else {
         /* RegDist-only encoding; the pipe selector bits only exist on
          * Gfx12.5 (see the assert below).
          */
         const struct tgl_swsb swsb = { x & 0x7u,
                                        ((x & 0x78) == 0x10 ? TGL_PIPE_FLOAT :
                                         (x & 0x78) == 0x18 ? TGL_PIPE_INT :
                                         (x & 0x78) == 0x50 ? TGL_PIPE_LONG :
                                         (x & 0x78) == 0x8 ?
TGL_PIPE_ALL :
                                         TGL_PIPE_NONE) };
         assert(devinfo->verx10 >= 125 || swsb.pipe == TGL_PIPE_NONE);
         return swsb;
      }
   }
}

/* Function codes of the SYNC instruction. */
enum tgl_sync_function {
   TGL_SYNC_NOP = 0x0,
   TGL_SYNC_ALLRD = 0x2,
   TGL_SYNC_ALLWR = 0x3,
   TGL_SYNC_FENCE = 0xd,
   TGL_SYNC_BAR = 0xe,
   TGL_SYNC_HOST = 0xf
};

/**
 * Message target: Shared Function ID for where to SEND a message.
 *
 * These are enumerated in the ISA reference under "send - Send Message".
 * In particular, see the following tables:
 * - G45 PRM, Volume 4, Table 14-15 "Message Descriptor Definition"
 * - Sandybridge PRM, Volume 4 Part 2, Table 8-16 "Extended Message Descriptor"
 * - Ivybridge PRM, Volume 1 Part 1, section 3.2.7 "GPE Function IDs"
 *
 * Note that several numeric values are reused across generations with
 * different meanings (e.g. 4 is DATAPORT_READ on Gfx4 but the sampler-cache
 * dataport on Gfx6), so the per-generation prefixes below matter.
 */
enum brw_message_target {
   BRW_SFID_NULL = 0,
   BRW_SFID_MATH = 1, /* Only valid on Gfx4-5 */
   BRW_SFID_SAMPLER = 2,
   BRW_SFID_MESSAGE_GATEWAY = 3,
   BRW_SFID_DATAPORT_READ = 4,
   BRW_SFID_DATAPORT_WRITE = 5,
   BRW_SFID_URB = 6,
   BRW_SFID_THREAD_SPAWNER = 7,
   BRW_SFID_VME = 8,

   GFX6_SFID_DATAPORT_SAMPLER_CACHE = 4,
   GFX6_SFID_DATAPORT_RENDER_CACHE = 5,
   GFX6_SFID_DATAPORT_CONSTANT_CACHE = 9,

   GFX7_SFID_DATAPORT_DATA_CACHE = 10,
   GFX7_SFID_PIXEL_INTERPOLATOR = 11,
   HSW_SFID_DATAPORT_DATA_CACHE_1 = 12,
   HSW_SFID_CRE = 13,

   GFX12_SFID_TGM = 13, /* Typed Global Memory */
   GFX12_SFID_SLM = 14, /* Shared Local Memory */
   GFX12_SFID_UGM = 15, /* Untyped Global Memory */

   GEN_RT_SFID_BINDLESS_THREAD_DISPATCH = 7,
   GEN_RT_SFID_RAY_TRACE_ACCELERATOR = 8,
};

#define GFX7_MESSAGE_TARGET_DP_DATA_CACHE 10

/* Sampler message return formats. */
#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0
#define BRW_SAMPLER_RETURN_FORMAT_UINT32 2
#define BRW_SAMPLER_RETURN_FORMAT_SINT32 3

#define GFX8_SAMPLER_RETURN_FORMAT_32BITS 0
#define GFX8_SAMPLER_RETURN_FORMAT_16BITS 1

/* Gfx4 sampler message types (several share the same code and are
 * disambiguated by the SIMD mode).
 */
#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE 0
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE 0
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0
#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX 1
#define
BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE 1 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD8_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD16_LD 3 + +#define GFX5_SAMPLER_MESSAGE_SAMPLE 0 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS 1 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_LOD 2 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE 3 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS 4 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_LD 7 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4 8 +#define GFX5_SAMPLER_MESSAGE_LOD 9 +#define GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10 +#define GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO 11 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C 16 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO 17 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C 18 +#define XE2_SAMPLER_MESSAGE_SAMPLE_MLOD 18 +#define XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD 19 +#define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20 +#define GFX9_SAMPLER_MESSAGE_SAMPLE_LZ 24 +#define GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ 25 +#define GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ 26 +#define GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W 28 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS 29 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS 30 +#define GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS 31 + +/* for GFX5 only */ +#define BRW_SAMPLER_SIMD_MODE_SIMD4X2 0 +#define BRW_SAMPLER_SIMD_MODE_SIMD8 1 +#define 
BRW_SAMPLER_SIMD_MODE_SIMD16 2 +#define BRW_SAMPLER_SIMD_MODE_SIMD32_64 3 + +#define GFX10_SAMPLER_SIMD_MODE_SIMD8H 5 +#define GFX10_SAMPLER_SIMD_MODE_SIMD16H 6 + +#define XE2_SAMPLER_SIMD_MODE_SIMD16 1 +#define XE2_SAMPLER_SIMD_MODE_SIMD32 2 +#define XE2_SAMPLER_SIMD_MODE_SIMD16H 5 +#define XE2_SAMPLER_SIMD_MODE_SIMD32H 6 + +/* GFX9 changes SIMD mode 0 to mean SIMD8D, but lets us get the SIMD4x2 + * behavior by setting bit 22 of dword 2 in the message header. */ +#define GFX9_SAMPLER_SIMD_MODE_SIMD8D 0 +#define GFX9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2 (1 << 22) + +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW 0 +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH 1 +#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2 +#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3 +#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4 +#define GFX12_DATAPORT_OWORD_BLOCK_16_OWORDS 5 +#define BRW_DATAPORT_OWORD_BLOCK_OWORDS(n) \ + ((n) == 1 ? BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW : \ + (n) == 2 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS : \ + (n) == 4 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS : \ + (n) == 8 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : \ + (n) == 16 ? GFX12_DATAPORT_OWORD_BLOCK_16_OWORDS : \ + (abort(), ~0)) +#define BRW_DATAPORT_OWORD_BLOCK_DWORDS(n) \ + ((n) == 4 ? BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW : \ + (n) == 8 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS : \ + (n) == 16 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS : \ + (n) == 32 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : \ + (abort(), ~0)) + +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD 0 +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS 2 + +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2 +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3 + +/* This one stays the same across generations. 
*/ +#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0 +/* GFX4 */ +#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1 +#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 2 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3 +/* G45, GFX5 */ +#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ 3 +#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 +/* GFX6 */ +#define GFX6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define GFX6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define GFX6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ 5 +#define GFX6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 + +#define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0 +#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1 +#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE 2 + +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE 0 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED 1 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01 2 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23 3 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01 4 + +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 0 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 1 +#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 2 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 3 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 4 +#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5 +#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7 + +/* GFX6 */ +#define GFX6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE 7 +#define GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 8 +#define GFX6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 9 +#define 
GFX6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 10 +#define GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 11 +#define GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 12 +#define GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE 13 +#define GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE 14 + +/* GFX7 */ +#define GFX7_DATAPORT_RC_MEDIA_BLOCK_READ 4 +#define GFX7_DATAPORT_RC_TYPED_SURFACE_READ 5 +#define GFX7_DATAPORT_RC_TYPED_ATOMIC_OP 6 +#define GFX7_DATAPORT_RC_MEMORY_FENCE 7 +#define GFX7_DATAPORT_RC_MEDIA_BLOCK_WRITE 10 +#define GFX7_DATAPORT_RC_RENDER_TARGET_WRITE 12 +#define GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE 13 +#define GFX7_DATAPORT_DC_OWORD_BLOCK_READ 0 +#define GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ 1 +#define GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ 2 +#define GFX7_DATAPORT_DC_DWORD_SCATTERED_READ 3 +#define GFX7_DATAPORT_DC_BYTE_SCATTERED_READ 4 +#define GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ 5 +#define GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP 6 +#define GFX7_DATAPORT_DC_MEMORY_FENCE 7 +#define GFX7_DATAPORT_DC_OWORD_BLOCK_WRITE 8 +#define GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE 10 +#define GFX7_DATAPORT_DC_DWORD_SCATTERED_WRITE 11 +#define GFX7_DATAPORT_DC_BYTE_SCATTERED_WRITE 12 +#define GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE 13 + +#define GFX7_DATAPORT_SCRATCH_READ ((1 << 18) | \ + (0 << 17)) +#define GFX7_DATAPORT_SCRATCH_WRITE ((1 << 18) | \ + (1 << 17)) +#define GFX7_DATAPORT_SCRATCH_NUM_REGS_SHIFT 12 + +#define GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET 0 +#define GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE 1 +#define GFX7_PIXEL_INTERPOLATOR_LOC_CENTROID 2 +#define GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET 3 + +/* HSW */ +#define HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_READ 0 +#define HSW_DATAPORT_DC_PORT0_UNALIGNED_OWORD_BLOCK_READ 1 +#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_READ 2 +#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_READ 3 +#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ 4 +#define HSW_DATAPORT_DC_PORT0_MEMORY_FENCE 7 +#define 
HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_WRITE 8 +#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_WRITE 10 +#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_WRITE 11 +#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE 12 + +#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ 1 +#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP 2 +#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2 3 +#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ 4 +#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ 5 +#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP 6 +#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2 7 +#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE 9 +#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE 10 +#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP 11 +#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2 12 +#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE 13 +#define GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ 0x10 +#define GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ 0x11 +#define GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP 0x12 +#define GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP 0x13 +#define GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ 0x14 +#define GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE 0x15 +#define GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE 0x19 +#define GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE 0x1a +#define GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP 0x1b +#define GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP 0x1d +#define GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP 0x1e + +/* GFX9 */ +#define GFX9_DATAPORT_RC_RENDER_TARGET_WRITE 12 +#define GFX9_DATAPORT_RC_RENDER_TARGET_READ 13 + +/* A64 scattered message subtype */ +#define GFX8_A64_SCATTERED_SUBTYPE_BYTE 0 +#define GFX8_A64_SCATTERED_SUBTYPE_DWORD 1 +#define GFX8_A64_SCATTERED_SUBTYPE_QWORD 2 +#define GFX8_A64_SCATTERED_SUBTYPE_HWORD 3 + +/* Dataport special binding table indices: */ +#define BRW_BTI_STATELESS 255 +#define GFX7_BTI_SLM 254 + +#define 
HSW_BTI_STATELESS_LOCALLY_COHERENT 255 +#define HSW_BTI_STATELESS_NON_COHERENT 253 +#define HSW_BTI_STATELESS_GLOBALLY_COHERENT 252 +#define HSW_BTI_STATELESS_LLC_COHERENT 251 +#define HSW_BTI_STATELESS_L3_UNCACHED 250 + +/* The hardware docs are a bit contradictory here. On Haswell, where they + * first added cache ability control, there were 5 different cache modes (see + * HSW_BTI_STATELESS_* above). On Broadwell, they reduced to two: + * + * - IA-Coherent (BTI=255): Coherent within Gen and coherent within the + * entire IA cache memory hierarchy. + * + * - Non-Coherent (BTI=253): Coherent within Gen, same cache type. + * + * Information about stateless cache coherency can be found in the "A32 + * Stateless" section of the "3D Media GPGPU" volume of the PRM for each + * hardware generation. + * + * Unfortunately, the docs for MDC_STATELESS appear to have been copied and + * pasted from Haswell and give the Haswell definitions for the BTI values of + * 255 and 253 including a warning about accessing 253 surfaces from multiple + * threads. This seems to be a copy+paste error and the definitions from the + * "A32 Stateless" section should be trusted instead. + * + * Note that because the DRM sets bit 4 of HDC_CHICKEN0 on BDW, CHV and at + * least some pre-production steppings of SKL due to WaForceEnableNonCoherent, + * HDC memory access may have been overridden by the kernel to be non-coherent + * (matching the behavior of the same BTI on pre-Gfx8 hardware) and BTI 255 + * may actually be an alias for BTI 253. + */ +#define GFX8_BTI_STATELESS_IA_COHERENT 255 +#define GFX8_BTI_STATELESS_NON_COHERENT 253 +#define GFX9_BTI_BINDLESS 252 + +/* This ID doesn't map anything HW related value. It exists to inform the + * lowering code to not use the bindless heap. + */ +#define GFX125_NON_BINDLESS (1u << 16) + +/* Dataport atomic operations for Untyped Atomic Integer Operation message + * (and others). 
+ */ +#define BRW_AOP_AND 1 +#define BRW_AOP_OR 2 +#define BRW_AOP_XOR 3 +#define BRW_AOP_MOV 4 +#define BRW_AOP_INC 5 +#define BRW_AOP_DEC 6 +#define BRW_AOP_ADD 7 +#define BRW_AOP_SUB 8 +#define BRW_AOP_REVSUB 9 +#define BRW_AOP_IMAX 10 +#define BRW_AOP_IMIN 11 +#define BRW_AOP_UMAX 12 +#define BRW_AOP_UMIN 13 +#define BRW_AOP_CMPWR 14 +#define BRW_AOP_PREDEC 15 + +/* Dataport atomic operations for Untyped Atomic Float Operation message. */ +#define BRW_AOP_FMAX 1 +#define BRW_AOP_FMIN 2 +#define BRW_AOP_FCMPWR 3 +#define BRW_AOP_FADD 4 + +#define BRW_MATH_FUNCTION_INV 1 +#define BRW_MATH_FUNCTION_LOG 2 +#define BRW_MATH_FUNCTION_EXP 3 +#define BRW_MATH_FUNCTION_SQRT 4 +#define BRW_MATH_FUNCTION_RSQ 5 +#define BRW_MATH_FUNCTION_SIN 6 +#define BRW_MATH_FUNCTION_COS 7 +#define BRW_MATH_FUNCTION_SINCOS 8 /* gfx4, gfx5 */ +#define BRW_MATH_FUNCTION_FDIV 9 /* gfx6+ */ +#define BRW_MATH_FUNCTION_POW 10 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12 +#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER 13 +#define GFX8_MATH_FUNCTION_INVM 14 +#define GFX8_MATH_FUNCTION_RSQRTM 15 + +#define BRW_MATH_INTEGER_UNSIGNED 0 +#define BRW_MATH_INTEGER_SIGNED 1 + +#define BRW_MATH_PRECISION_FULL 0 +#define BRW_MATH_PRECISION_PARTIAL 1 + +#define BRW_MATH_SATURATE_NONE 0 +#define BRW_MATH_SATURATE_SATURATE 1 + +#define BRW_MATH_DATA_VECTOR 0 +#define BRW_MATH_DATA_SCALAR 1 + +#define BRW_URB_OPCODE_WRITE_HWORD 0 +#define BRW_URB_OPCODE_WRITE_OWORD 1 +#define BRW_URB_OPCODE_READ_HWORD 2 +#define BRW_URB_OPCODE_READ_OWORD 3 +#define GFX7_URB_OPCODE_ATOMIC_MOV 4 +#define GFX7_URB_OPCODE_ATOMIC_INC 5 +#define GFX8_URB_OPCODE_ATOMIC_ADD 6 +#define GFX8_URB_OPCODE_SIMD8_WRITE 7 +#define GFX8_URB_OPCODE_SIMD8_READ 8 +#define GFX125_URB_OPCODE_FENCE 9 + +#define BRW_URB_SWIZZLE_NONE 0 +#define BRW_URB_SWIZZLE_INTERLEAVE 1 +#define BRW_URB_SWIZZLE_TRANSPOSE 2 + +#define BRW_SCRATCH_SPACE_SIZE_1K 0 +#define BRW_SCRATCH_SPACE_SIZE_2K 1 
+#define BRW_SCRATCH_SPACE_SIZE_4K 2 +#define BRW_SCRATCH_SPACE_SIZE_8K 3 +#define BRW_SCRATCH_SPACE_SIZE_16K 4 +#define BRW_SCRATCH_SPACE_SIZE_32K 5 +#define BRW_SCRATCH_SPACE_SIZE_64K 6 +#define BRW_SCRATCH_SPACE_SIZE_128K 7 +#define BRW_SCRATCH_SPACE_SIZE_256K 8 +#define BRW_SCRATCH_SPACE_SIZE_512K 9 +#define BRW_SCRATCH_SPACE_SIZE_1M 10 +#define BRW_SCRATCH_SPACE_SIZE_2M 11 + +#define BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY 0 +#define BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY 1 +#define BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG 2 +#define BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP 3 +#define BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG 4 +#define BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE 5 +#define BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE 6 + + +/* Gfx7 "GS URB Entry Allocation Size" is a U9-1 field, so the maximum gs_size + * is 2^9, or 512. It's counted in multiples of 64 bytes. + * + * Identical for VS, DS, and HS. + */ +#define GFX7_MAX_GS_URB_ENTRY_SIZE_BYTES (512*64) +#define GFX7_MAX_DS_URB_ENTRY_SIZE_BYTES (512*64) +#define GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES (512*64) +#define GFX7_MAX_VS_URB_ENTRY_SIZE_BYTES (512*64) + +#define BRW_GS_EDGE_INDICATOR_0 (1 << 8) +#define BRW_GS_EDGE_INDICATOR_1 (1 << 9) + +/* Gfx6 "GS URB Entry Allocation Size" is defined as a number of 1024-bit + * (128 bytes) URB rows and the maximum allowed value is 5 rows. + */ +#define GFX6_MAX_GS_URB_ENTRY_SIZE_BYTES (5*128) + +/* GS Thread Payload + */ + +/* 3DSTATE_GS "Output Vertex Size" has an effective maximum of 62. It's + * counted in multiples of 16 bytes. 
+ */ +#define GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES (62*16) + + +/* R0 */ +# define GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT 27 + +/* CR0.0[5:4] Floating-Point Rounding Modes + * Skylake PRM, Volume 7 Part 1, "Control Register", page 756 + */ + +#define BRW_CR0_RND_MODE_MASK 0x30 +#define BRW_CR0_RND_MODE_SHIFT 4 + +enum ENUM_PACKED brw_rnd_mode { + BRW_RND_MODE_RTNE = 0, /* Round to Nearest or Even */ + BRW_RND_MODE_RU = 1, /* Round Up, toward +inf */ + BRW_RND_MODE_RD = 2, /* Round Down, toward -inf */ + BRW_RND_MODE_RTZ = 3, /* Round Toward Zero */ + BRW_RND_MODE_UNSPECIFIED, /* Unspecified rounding mode */ +}; + +#define BRW_CR0_FP64_DENORM_PRESERVE (1 << 6) +#define BRW_CR0_FP32_DENORM_PRESERVE (1 << 7) +#define BRW_CR0_FP16_DENORM_PRESERVE (1 << 10) + +#define BRW_CR0_FP_MODE_MASK (BRW_CR0_FP64_DENORM_PRESERVE | \ + BRW_CR0_FP32_DENORM_PRESERVE | \ + BRW_CR0_FP16_DENORM_PRESERVE | \ + BRW_CR0_RND_MODE_MASK) + +/* MDC_DS - Data Size Message Descriptor Control Field + * Skylake PRM, Volume 2d, page 129 + * + * Specifies the number of Bytes to be read or written per Dword used at + * byte_scattered read/write and byte_scaled read/write messages. + */ +#define GFX7_BYTE_SCATTERED_DATA_ELEMENT_BYTE 0 +#define GFX7_BYTE_SCATTERED_DATA_ELEMENT_WORD 1 +#define GFX7_BYTE_SCATTERED_DATA_ELEMENT_DWORD 2 + +#define GEN_RT_BTD_MESSAGE_SPAWN 1 + +#define GEN_RT_TRACE_RAY_INITAL 0 +#define GEN_RT_TRACE_RAY_INSTANCE 1 +#define GEN_RT_TRACE_RAY_COMMIT 2 +#define GEN_RT_TRACE_RAY_CONTINUE 3 + +#define GEN_RT_BTD_SHADER_TYPE_ANY_HIT 0 +#define GEN_RT_BTD_SHADER_TYPE_CLOSEST_HIT 1 +#define GEN_RT_BTD_SHADER_TYPE_MISS 2 +#define GEN_RT_BTD_SHADER_TYPE_INTERSECTION 3 + +/* Starting with Xe-HPG, the old dataport was massively reworked dataport. + * The new thing, called Load/Store Cache or LSC, has a significantly improved + * interface. 
Instead of bespoke messages for every case, there's basically + * one or two messages with different bits to control things like address + * size, how much data is read/written, etc. It's way nicer but also means we + * get to rewrite all our dataport encoding/decoding code. This patch kicks + * off the party with all of the new enums. + */ +enum lsc_opcode { + LSC_OP_LOAD = 0, + LSC_OP_LOAD_CMASK = 2, + LSC_OP_STORE = 4, + LSC_OP_STORE_CMASK = 6, + LSC_OP_ATOMIC_INC = 8, + LSC_OP_ATOMIC_DEC = 9, + LSC_OP_ATOMIC_LOAD = 10, + LSC_OP_ATOMIC_STORE = 11, + LSC_OP_ATOMIC_ADD = 12, + LSC_OP_ATOMIC_SUB = 13, + LSC_OP_ATOMIC_MIN = 14, + LSC_OP_ATOMIC_MAX = 15, + LSC_OP_ATOMIC_UMIN = 16, + LSC_OP_ATOMIC_UMAX = 17, + LSC_OP_ATOMIC_CMPXCHG = 18, + LSC_OP_ATOMIC_FADD = 19, + LSC_OP_ATOMIC_FSUB = 20, + LSC_OP_ATOMIC_FMIN = 21, + LSC_OP_ATOMIC_FMAX = 22, + LSC_OP_ATOMIC_FCMPXCHG = 23, + LSC_OP_ATOMIC_AND = 24, + LSC_OP_ATOMIC_OR = 25, + LSC_OP_ATOMIC_XOR = 26, + LSC_OP_FENCE = 31 +}; + +/* + * Specifies the size of the dataport address payload in registers. + */ +enum ENUM_PACKED lsc_addr_reg_size { + LSC_ADDR_REG_SIZE_1 = 1, + LSC_ADDR_REG_SIZE_2 = 2, + LSC_ADDR_REG_SIZE_3 = 3, + LSC_ADDR_REG_SIZE_4 = 4, + LSC_ADDR_REG_SIZE_6 = 6, + LSC_ADDR_REG_SIZE_8 = 8, +}; + +/* + * Specifies the size of the address payload item in a dataport message. + */ +enum ENUM_PACKED lsc_addr_size { + LSC_ADDR_SIZE_A16 = 1, /* 16-bit address offset */ + LSC_ADDR_SIZE_A32 = 2, /* 32-bit address offset */ + LSC_ADDR_SIZE_A64 = 3, /* 64-bit address offset */ +}; + +/* + * Specifies the type of the address payload item in a dataport message. The + * address type specifies how the dataport message decodes the Extended + * Descriptor for the surface attributes and address calculation. 
+ */ +enum ENUM_PACKED lsc_addr_surface_type { + LSC_ADDR_SURFTYPE_FLAT = 0, /* Flat */ + LSC_ADDR_SURFTYPE_BSS = 1, /* Bindless surface state */ + LSC_ADDR_SURFTYPE_SS = 2, /* Surface state */ + LSC_ADDR_SURFTYPE_BTI = 3, /* Binding table index */ +}; + +/* + * Specifies the dataport message override to the default L1 and L3 memory + * cache policies. Dataport L1 cache policies are uncached (UC), cached (C), + * cache streaming (S) and invalidate-after-read (IAR). Dataport L3 cache + * policies are uncached (UC) and cached (C). + */ +enum lsc_cache_load { + /* No override. Use the non-pipelined state or surface state cache settings + * for L1 and L3. + */ + LSC_CACHE_LOAD_L1STATE_L3MOCS = 0, + /* Override to L1 uncached and L3 uncached */ + LSC_CACHE_LOAD_L1UC_L3UC = 1, + /* Override to L1 uncached and L3 cached */ + LSC_CACHE_LOAD_L1UC_L3C = 2, + /* Override to L1 cached and L3 uncached */ + LSC_CACHE_LOAD_L1C_L3UC = 3, + /* Override to cache at both L1 and L3 */ + LSC_CACHE_LOAD_L1C_L3C = 4, + /* Override to L1 streaming load and L3 uncached */ + LSC_CACHE_LOAD_L1S_L3UC = 5, + /* Override to L1 streaming load and L3 cached */ + LSC_CACHE_LOAD_L1S_L3C = 6, + /* For load messages, override to L1 invalidate-after-read, and L3 cached. */ + LSC_CACHE_LOAD_L1IAR_L3C = 7, +}; + +/* + * Specifies the dataport message override to the default L1 and L3 memory + * cache policies. Dataport L1 cache policies are uncached (UC), cached (C), + * streaming (S) and invalidate-after-read (IAR). Dataport L3 cache policies + * are uncached (UC), cached (C), cached-as-a-constand (CC) and + * invalidate-after-read (IAR). + */ +enum PACKED xe2_lsc_cache_load { + /* No override. Use the non-pipelined or surface state cache settings for L1 + * and L3. 
+ */ + XE2_LSC_CACHE_LOAD_L1STATE_L3MOCS = 0, + /* Override to L1 uncached and L3 uncached */ + XE2_LSC_CACHE_LOAD_L1UC_L3UC = 2, + /* Override to L1 uncached and L3 cached */ + XE2_LSC_CACHE_LOAD_L1UC_L3C = 4, + /* Override to L1 uncached and L3 cached as a constant */ + XE2_LSC_CACHE_LOAD_L1UC_L3CC = 5, + /* Override to L1 cached and L3 uncached */ + XE2_LSC_CACHE_LOAD_L1C_L3UC = 6, + /* Override to L1 cached and L3 cached */ + XE2_LSC_CACHE_LOAD_L1C_L3C = 8, + /* Override to L1 cached and L3 cached as a constant */ + XE2_LSC_CACHE_LOAD_L1C_L3CC = 9, + /* Override to L1 cached as streaming load and L3 uncached */ + XE2_LSC_CACHE_LOAD_L1S_L3UC = 10, + /* Override to L1 cached as streaming load and L3 cached */ + XE2_LSC_CACHE_LOAD_L1S_L3C = 12, + /* Override to L1 and L3 invalidate after read */ + XE2_LSC_CACHE_LOAD_L1IAR_L3IAR = 14, + +}; + +/* + * Specifies the dataport message override to the default L1 and L3 memory + * cache policies. Dataport L1 cache policies are uncached (UC), write-through + * (WT), write-back (WB) and streaming (S). Dataport L3 cache policies are + * uncached (UC) and cached (WB). + */ +enum ENUM_PACKED lsc_cache_store { + /* No override. Use the non-pipelined or surface state cache settings for L1 + * and L3. + */ + LSC_CACHE_STORE_L1STATE_L3MOCS = 0, + /* Override to L1 uncached and L3 uncached */ + LSC_CACHE_STORE_L1UC_L3UC = 1, + /* Override to L1 uncached and L3 cached */ + LSC_CACHE_STORE_L1UC_L3WB = 2, + /* Override to L1 write-through and L3 uncached */ + LSC_CACHE_STORE_L1WT_L3UC = 3, + /* Override to L1 write-through and L3 cached */ + LSC_CACHE_STORE_L1WT_L3WB = 4, + /* Override to L1 streaming and L3 uncached */ + LSC_CACHE_STORE_L1S_L3UC = 5, + /* Override to L1 streaming and L3 cached */ + LSC_CACHE_STORE_L1S_L3WB = 6, + /* Override to L1 write-back, and L3 cached */ + LSC_CACHE_STORE_L1WB_L3WB = 7, + +}; + +/* + * Specifies the dataport message override to the default L1 and L3 memory + * cache policies. 
Dataport L1 cache policies are uncached (UC), write-through + * (WT), write-back (WB) and streaming (S). Dataport L3 cache policies are + * uncached (UC) and cached (WB). + */ +enum PACKED xe2_lsc_cache_store { + /* No override. Use the non-pipelined or surface state cache settings for L1 + * and L3. + */ + XE2_LSC_CACHE_STORE_L1STATE_L3MOCS = 0, + /* Override to L1 uncached and L3 uncached */ + XE2_LSC_CACHE_STORE_L1UC_L3UC = 2, + /* Override to L1 uncached and L3 cached */ + XE2_LSC_CACHE_STORE_L1UC_L3WB = 4, + /* Override to L1 write-through and L3 uncached */ + XE2_LSC_CACHE_STORE_L1WT_L3UC = 6, + /* Override to L1 write-through and L3 cached */ + XE2_LSC_CACHE_STORE_L1WT_L3WB = 8, + /* Override to L1 streaming and L3 uncached */ + XE2_LSC_CACHE_STORE_L1S_L3UC = 10, + /* Override to L1 streaming and L3 cached */ + XE2_LSC_CACHE_STORE_L1S_L3WB = 12, + /* Override to L1 write-back and L3 cached */ + XE2_LSC_CACHE_STORE_L1WB_L3WB = 14, + +}; + +#define LSC_CACHE(devinfo, l_or_s, cc) \ + ((devinfo)->ver < 20 ? (unsigned)LSC_CACHE_ ## l_or_s ## _ ## cc : \ + (unsigned)XE2_LSC_CACHE_ ## l_or_s ## _ ## cc) + +/* + * Specifies which components of the data payload 4-element vector (X,Y,Z,W) is + * packed into the register payload. + */ +enum ENUM_PACKED lsc_cmask { + LSC_CMASK_X = 0x1, + LSC_CMASK_Y = 0x2, + LSC_CMASK_XY = 0x3, + LSC_CMASK_Z = 0x4, + LSC_CMASK_XZ = 0x5, + LSC_CMASK_YZ = 0x6, + LSC_CMASK_XYZ = 0x7, + LSC_CMASK_W = 0x8, + LSC_CMASK_XW = 0x9, + LSC_CMASK_YW = 0xa, + LSC_CMASK_XYW = 0xb, + LSC_CMASK_ZW = 0xc, + LSC_CMASK_XZW = 0xd, + LSC_CMASK_YZW = 0xe, + LSC_CMASK_XYZW = 0xf, +}; + +/* + * Specifies the size of the data payload item in a dataport message. + */ +enum ENUM_PACKED lsc_data_size { + /* 8-bit scalar data value in memory, packed into a 8-bit data value in + * register. + */ + LSC_DATA_SIZE_D8 = 0, + /* 16-bit scalar data value in memory, packed into a 16-bit data value in + * register. 
+ */ + LSC_DATA_SIZE_D16 = 1, + /* 32-bit scalar data value in memory, packed into 32-bit data value in + * register. + */ + LSC_DATA_SIZE_D32 = 2, + /* 64-bit scalar data value in memory, packed into 64-bit data value in + * register. + */ + LSC_DATA_SIZE_D64 = 3, + /* 8-bit scalar data value in memory, packed into 32-bit unsigned data value + * in register. + */ + LSC_DATA_SIZE_D8U32 = 4, + /* 16-bit scalar data value in memory, packed into 32-bit unsigned data + * value in register. + */ + LSC_DATA_SIZE_D16U32 = 5, + /* 16-bit scalar BigFloat data value in memory, packed into 32-bit float + * value in register. + */ + LSC_DATA_SIZE_D16BF32 = 6, +}; + +/* + * Enum specifies the scope of the fence. + */ +enum ENUM_PACKED lsc_fence_scope { + /* Wait until all previous memory transactions from this thread are observed + * within the local thread-group. + */ + LSC_FENCE_THREADGROUP = 0, + /* Wait until all previous memory transactions from this thread are observed + * within the local sub-slice. + */ + LSC_FENCE_LOCAL = 1, + /* Wait until all previous memory transactions from this thread are observed + * in the local tile. + */ + LSC_FENCE_TILE = 2, + /* Wait until all previous memory transactions from this thread are observed + * in the local GPU. + */ + LSC_FENCE_GPU = 3, + /* Wait until all previous memory transactions from this thread are observed + * across all GPUs in the system. + */ + LSC_FENCE_ALL_GPU = 4, + /* Wait until all previous memory transactions from this thread are observed + * at the "system" level. + */ + LSC_FENCE_SYSTEM_RELEASE = 5, + /* For GPUs that do not follow PCIe Write ordering for downstream writes + * targeting device memory, a fence message with scope=System_Acquire will + * commit to device memory all downstream and peer writes that have reached + * the device. + */ + LSC_FENCE_SYSTEM_ACQUIRE = 6, +}; + +/* + * Specifies the type of cache flush operation to perform after a fence is + * complete. 
+ */ +enum ENUM_PACKED lsc_flush_type { + LSC_FLUSH_TYPE_NONE = 0, + /* + * For a R/W cache, evict dirty lines (M to I state) and invalidate clean + * lines. For a RO cache, invalidate clean lines. + */ + LSC_FLUSH_TYPE_EVICT = 1, + /* + * For both R/W and RO cache, invalidate clean lines in the cache. + */ + LSC_FLUSH_TYPE_INVALIDATE = 2, + /* + * For a R/W cache, invalidate dirty lines (M to I state), without + * write-back to next level. This opcode does nothing for a RO cache. + */ + LSC_FLUSH_TYPE_DISCARD = 3, + /* + * For a R/W cache, write-back dirty lines to the next level, but kept in + * the cache as "clean" (M to V state). This opcode does nothing for a RO + * cache. + */ + LSC_FLUSH_TYPE_CLEAN = 4, + /* + * Flush "RW" section of the L3 cache, but leave L1 and L2 caches untouched. + */ + LSC_FLUSH_TYPE_L3ONLY = 5, + /* + * HW maps this flush type internally to NONE. + */ + LSC_FLUSH_TYPE_NONE_6 = 6, + +}; + +enum ENUM_PACKED lsc_backup_fence_routing { + /* Normal routing: UGM fence is routed to UGM pipeline. */ + LSC_NORMAL_ROUTING, + /* Route UGM fence to LSC unit. */ + LSC_ROUTE_TO_LSC, +}; + +/* + * Specifies the size of the vector in a dataport message. + */ +enum ENUM_PACKED lsc_vect_size { + LSC_VECT_SIZE_V1 = 0, /* vector length 1 */ + LSC_VECT_SIZE_V2 = 1, /* vector length 2 */ + LSC_VECT_SIZE_V3 = 2, /* Vector length 3 */ + LSC_VECT_SIZE_V4 = 3, /* Vector length 4 */ + LSC_VECT_SIZE_V8 = 4, /* Vector length 8 */ + LSC_VECT_SIZE_V16 = 5, /* Vector length 16 */ + LSC_VECT_SIZE_V32 = 6, /* Vector length 32 */ + LSC_VECT_SIZE_V64 = 7, /* Vector length 64 */ +}; + +#define LSC_ONE_ADDR_REG 1 + +#endif /* BRW_EU_DEFINES_H */ diff --git a/src/intel/compiler/elk/brw_eu_emit.c b/src/intel/compiler/elk/brw_eu_emit.c new file mode 100644 index 00000000000..74bd9c0ddbe --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_emit.c @@ -0,0 +1,3770 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. 
+ Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + + +#include "brw_eu_defines.h" +#include "brw_eu.h" + +#include "util/ralloc.h" + +/** + * Prior to Sandybridge, the SEND instruction accepted non-MRF source + * registers, implicitly moving the operand to a message register. + * + * On Sandybridge, this is no longer the case. This function performs the + * explicit move; it should be called before emitting a SEND instruction. 
 + */ +void +gfx6_resolve_implied_move(struct brw_codegen *p, + struct brw_reg *src, + unsigned msg_reg_nr) +{ + const struct intel_device_info *devinfo = p->devinfo; + if (devinfo->ver < 6) + return; + + if (src->file == BRW_MESSAGE_REGISTER_FILE) + return; + + if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) { + assert(devinfo->ver < 12); + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD), + retype(*src, BRW_REGISTER_TYPE_UD)); + brw_pop_insn_state(p); + } + *src = brw_message_reg(msg_reg_nr); +} + +static void +gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg) +{ + /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"): + * "The send with EOT should use register space R112-R127 for <src>. This is + * to enable loading of a new thread into the same slot while the message + * with EOT for current thread is pending dispatch." + * + * Since we're pretending to have 16 MRFs anyway, we may as well use the + * registers required for messages with EOT. + */ + const struct intel_device_info *devinfo = p->devinfo; + if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) { + reg->file = BRW_GENERAL_REGISTER_FILE; + reg->nr += GFX7_MRF_HACK_START; + } +} + +void +brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) +{ + const struct intel_device_info *devinfo = p->devinfo; + + if (dest.file == BRW_MESSAGE_REGISTER_FILE) + assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); + else if (dest.file == BRW_GENERAL_REGISTER_FILE) + assert(dest.nr < XE2_MAX_GRF); + + /* The hardware has a restriction where a destination of size Byte with + * a stride of 1 is only allowed for a packed byte MOV. 
For any other + * instruction, the stride must be at least 2, even when the destination + * is the NULL register. + */ + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_NULL && + type_sz(dest.type) == 1 && + dest.hstride == BRW_HORIZONTAL_STRIDE_1) { + dest.hstride = BRW_HORIZONTAL_STRIDE_2; + } + + gfx7_convert_mrf_to_grf(p, &dest); + + if (devinfo->ver >= 12 && + (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) { + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(dest.address_mode == BRW_ADDRESS_DIRECT); + assert(dest.subnr == 0); + assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 || + (dest.hstride == BRW_HORIZONTAL_STRIDE_1 && + dest.vstride == dest.width + 1)); + assert(!dest.negate && !dest.abs); + brw_inst_set_dst_reg_file(devinfo, inst, dest.file); + brw_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest)); + + } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) { + assert(devinfo->ver < 12); + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(dest.address_mode == BRW_ADDRESS_DIRECT); + assert(dest.subnr % 16 == 0); + assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 && + dest.vstride == dest.width + 1); + assert(!dest.negate && !dest.abs); + brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr); + brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16); + brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file); + } else { + brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type); + brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode); + + if (dest.address_mode == BRW_ADDRESS_DIRECT) { + brw_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest)); + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_dst_da1_subreg_nr(devinfo, inst, 
phys_subnr(devinfo, dest)); + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); + } else { + brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16); + brw_inst_set_da16_writemask(devinfo, inst, dest.writemask); + if (dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_MESSAGE_REGISTER_FILE) { + assert(dest.writemask != 0); + } + /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1: + * Although Dst.HorzStride is a don't care for Align16, HW needs + * this to be programmed as "01". + */ + brw_inst_set_dst_hstride(devinfo, inst, 1); + } + } else { + brw_inst_set_dst_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest)); + + /* These are different sizes in align1 vs align16: + */ + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_dst_ia1_addr_imm(devinfo, inst, + dest.indirect_offset); + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); + } else { + brw_inst_set_dst_ia16_addr_imm(devinfo, inst, + dest.indirect_offset); + /* even ignored in da16, still need to set as '01' */ + brw_inst_set_dst_hstride(devinfo, inst, 1); + } + } + } + + /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8) + * or 16 (SIMD16), as that's normally correct. However, when dealing with + * small registers, it can be useful for us to automatically reduce it to + * match the register size. + */ + if (p->automatic_exec_sizes) { + /* + * In platforms that support fp64 we can emit instructions with a width + * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In + * these cases we need to make sure that these instructions have their + * exec sizes set properly when they are emitted and we can't rely on + * this code to fix it. 
+ */ + bool fix_exec_size; + if (devinfo->ver >= 6) + fix_exec_size = dest.width < BRW_EXECUTE_4; + else + fix_exec_size = dest.width < BRW_EXECUTE_8; + + if (fix_exec_size) + brw_inst_set_exec_size(devinfo, inst, dest.width); + } +} + +void +brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) +{ + const struct intel_device_info *devinfo = p->devinfo; + + if (reg.file == BRW_MESSAGE_REGISTER_FILE) + assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); + else if (reg.file == BRW_GENERAL_REGISTER_FILE) + assert(reg.nr < XE2_MAX_GRF); + + gfx7_convert_mrf_to_grf(p, ®); + + if (devinfo->ver >= 6 && + (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC)) { + /* Any source modifiers or regions will be ignored, since this just + * identifies the MRF/GRF to start reading the message contents from. + * Check for some likely failures. 
+ */ + assert(!reg.negate); + assert(!reg.abs); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + } + + if (devinfo->ver >= 12 && + (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) { + assert(reg.file != BRW_IMMEDIATE_VALUE); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + assert(reg.subnr == 0); + assert(has_scalar_region(reg) || + (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && + reg.vstride == reg.width + 1)); + assert(!reg.negate && !reg.abs); + brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file); + brw_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg)); + + } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) { + assert(reg.file == BRW_GENERAL_REGISTER_FILE); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + assert(reg.subnr % 16 == 0); + assert(has_scalar_region(reg) || + (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && + reg.vstride == reg.width + 1)); + assert(!reg.negate && !reg.abs); + brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr); + brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16); + } else { + brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type); + brw_inst_set_src0_abs(devinfo, inst, reg.abs); + brw_inst_set_src0_negate(devinfo, inst, reg.negate); + brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode); + + if (reg.file == BRW_IMMEDIATE_VALUE) { + if (reg.type == BRW_REGISTER_TYPE_DF || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_DIM) + brw_inst_set_imm_df(devinfo, inst, reg.df); + else if (reg.type == BRW_REGISTER_TYPE_UQ || + reg.type == BRW_REGISTER_TYPE_Q) + brw_inst_set_imm_uq(devinfo, inst, reg.u64); + else + brw_inst_set_imm_ud(devinfo, inst, reg.ud); + + if (devinfo->ver < 12 && type_sz(reg.type) < 8) { + brw_inst_set_src1_reg_file(devinfo, inst, + BRW_ARCHITECTURE_REGISTER_FILE); + brw_inst_set_src1_reg_hw_type(devinfo, inst, + brw_inst_src0_reg_hw_type(devinfo, 
inst)); + } + } else { + if (reg.address_mode == BRW_ADDRESS_DIRECT) { + brw_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg)); + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_src0_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg)); + } else { + brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16); + } + } else { + brw_inst_set_src0_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg)); + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset); + } else { + brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset); + } + } + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { + brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); + brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1); + brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); + } else { + brw_inst_set_src0_hstride(devinfo, inst, reg.hstride); + brw_inst_set_src0_width(devinfo, inst, reg.width); + brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); + } + } else { + brw_inst_set_src0_da16_swiz_x(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); + brw_inst_set_src0_da16_swiz_y(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); + brw_inst_set_src0_da16_swiz_z(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); + brw_inst_set_src0_da16_swiz_w(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); + + if (reg.vstride == BRW_VERTICAL_STRIDE_8) { + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + } else if (devinfo->verx10 == 70 && + reg.type == BRW_REGISTER_TYPE_DF && + reg.vstride == BRW_VERTICAL_STRIDE_2) { + /* From SNB PRM: + * + * "For Align16 access mode, only encodings of 0000 and 0011 
+ * are allowed. Other codes are reserved." + * + * Presumably the DevSNB behavior applies to IVB as well. + */ + brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + } else { + brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); + } + } + } + } +} + + +void +brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) +{ + const struct intel_device_info *devinfo = p->devinfo; + + if (reg.file == BRW_GENERAL_REGISTER_FILE) + assert(reg.nr < XE2_MAX_GRF); + + if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC || + (devinfo->ver >= 12 && + (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC))) { + assert(reg.file == BRW_GENERAL_REGISTER_FILE || + reg.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + assert(reg.subnr == 0); + assert(has_scalar_region(reg) || + (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && + reg.vstride == reg.width + 1)); + assert(!reg.negate && !reg.abs); + brw_inst_set_send_src1_reg_nr(devinfo, inst, phys_nr(devinfo, reg)); + brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file); + } else { + /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5: + * + * "Accumulator registers may be accessed explicitly as src0 + * operands only." + */ + assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE || + reg.nr != BRW_ARF_ACCUMULATOR); + + gfx7_convert_mrf_to_grf(p, ®); + assert(reg.file != BRW_MESSAGE_REGISTER_FILE); + + brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type); + brw_inst_set_src1_abs(devinfo, inst, reg.abs); + brw_inst_set_src1_negate(devinfo, inst, reg.negate); + + /* Only src1 can be immediate in two-argument instructions. 
+ */ + assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE); + + if (reg.file == BRW_IMMEDIATE_VALUE) { + /* two-argument instructions can only use 32-bit immediates */ + assert(type_sz(reg.type) < 8); + brw_inst_set_imm_ud(devinfo, inst, reg.ud); + } else { + /* This is a hardware restriction, which may or may not be lifted + * in the future: + */ + assert (reg.address_mode == BRW_ADDRESS_DIRECT); + /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */ + + brw_inst_set_src1_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg)); + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + brw_inst_set_src1_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg)); + } else { + brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16); + } + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { + brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); + brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1); + brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); + } else { + brw_inst_set_src1_hstride(devinfo, inst, reg.hstride); + brw_inst_set_src1_width(devinfo, inst, reg.width); + brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); + } + } else { + brw_inst_set_src1_da16_swiz_x(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); + brw_inst_set_src1_da16_swiz_y(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); + brw_inst_set_src1_da16_swiz_z(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); + brw_inst_set_src1_da16_swiz_w(devinfo, inst, + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); + + if (reg.vstride == BRW_VERTICAL_STRIDE_8) { + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + } else if (devinfo->verx10 == 70 && + reg.type == BRW_REGISTER_TYPE_DF && + reg.vstride == 
BRW_VERTICAL_STRIDE_2) { + /* From SNB PRM: + * + * "For Align16 access mode, only encodings of 0000 and 0011 + * are allowed. Other codes are reserved." + * + * Presumably the DevSNB behavior applies to IVB as well. + */ + brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); + } else { + brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); + } + } + } + } +} + +/** + * Specify the descriptor and extended descriptor immediate for a SEND(C) + * message instruction. + */ +void +brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst, + unsigned desc, unsigned ex_desc) +{ + const struct intel_device_info *devinfo = p->devinfo; + assert(brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC); + if (devinfo->ver < 12) + brw_inst_set_src1_file_type(devinfo, inst, + BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD); + brw_inst_set_send_desc(devinfo, inst, desc); + if (devinfo->ver >= 9) + brw_inst_set_send_ex_desc(devinfo, inst, ex_desc); +} + +static void brw_set_math_message( struct brw_codegen *p, + brw_inst *inst, + unsigned function, + unsigned integer_type, + bool low_precision, + unsigned dataType ) +{ + const struct intel_device_info *devinfo = p->devinfo; + unsigned msg_length; + unsigned response_length; + + /* Infer message length from the function */ + switch (function) { + case BRW_MATH_FUNCTION_POW: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: + case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + msg_length = 2; + break; + default: + msg_length = 1; + break; + } + + /* Infer response length from the function */ + switch (function) { + case BRW_MATH_FUNCTION_SINCOS: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + response_length = 2; + break; + default: + response_length = 1; + break; + } + + brw_set_desc(p, inst, brw_message_desc( + devinfo, msg_length, response_length, false)); + + brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH); + 
brw_inst_set_math_msg_function(devinfo, inst, function); + brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type); + brw_inst_set_math_msg_precision(devinfo, inst, low_precision); + brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst)); + brw_inst_set_math_msg_data_type(devinfo, inst, dataType); + brw_inst_set_saturate(devinfo, inst, 0); +} + + +static void brw_set_ff_sync_message(struct brw_codegen *p, + brw_inst *insn, + bool allocate, + unsigned response_length, + bool end_of_thread) +{ + const struct intel_device_info *devinfo = p->devinfo; + + brw_set_desc(p, insn, brw_message_desc( + devinfo, 1, response_length, true)); + + brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB); + brw_inst_set_eot(devinfo, insn, end_of_thread); + brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */ + brw_inst_set_urb_allocate(devinfo, insn, allocate); + /* The following fields are not used by FF_SYNC: */ + brw_inst_set_urb_global_offset(devinfo, insn, 0); + brw_inst_set_urb_swizzle_control(devinfo, insn, 0); + brw_inst_set_urb_used(devinfo, insn, 0); + brw_inst_set_urb_complete(devinfo, insn, 0); +} + +static void brw_set_urb_message( struct brw_codegen *p, + brw_inst *insn, + enum brw_urb_write_flags flags, + unsigned msg_length, + unsigned response_length, + unsigned offset, + unsigned swizzle_control ) +{ + const struct intel_device_info *devinfo = p->devinfo; + + assert(devinfo->ver < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE); + assert(devinfo->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE)); + assert(devinfo->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); + + brw_set_desc(p, insn, brw_message_desc( + devinfo, msg_length, response_length, true)); + + brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB); + brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT)); + + if (flags & BRW_URB_WRITE_OWORD) { + assert(msg_length == 2); /* header + one OWORD of data */ + brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD); 
+ } else { + brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD); + } + + brw_inst_set_urb_global_offset(devinfo, insn, offset); + brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control); + + if (devinfo->ver < 8) { + brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE)); + } + + if (devinfo->ver < 7) { + brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE)); + brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED)); + } else { + brw_inst_set_urb_per_slot_offset(devinfo, insn, + !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); + } +} + +static void +gfx7_set_dp_scratch_message(struct brw_codegen *p, + brw_inst *inst, + bool write, + bool dword, + bool invalidate_after_read, + unsigned num_regs, + unsigned addr_offset, + unsigned mlen, + unsigned rlen, + bool header_present) +{ + const struct intel_device_info *devinfo = p->devinfo; + assert(num_regs == 1 || num_regs == 2 || num_regs == 4 || + (devinfo->ver >= 8 && num_regs == 8)); + const unsigned block_size = (devinfo->ver >= 8 ? 
util_logbase2(num_regs) : + num_regs - 1); + + brw_set_desc(p, inst, brw_message_desc( + devinfo, mlen, rlen, header_present)); + + brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE); + brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */ + brw_inst_set_scratch_read_write(devinfo, inst, write); + brw_inst_set_scratch_type(devinfo, inst, dword); + brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read); + brw_inst_set_scratch_block_size(devinfo, inst, block_size); + brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset); +} + +static void +brw_inst_set_state(const struct brw_isa_info *isa, + brw_inst *insn, + const struct brw_insn_state *state) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + brw_inst_set_exec_size(devinfo, insn, state->exec_size); + brw_inst_set_group(devinfo, insn, state->group); + brw_inst_set_compression(devinfo, insn, state->compressed); + brw_inst_set_access_mode(devinfo, insn, state->access_mode); + brw_inst_set_mask_control(devinfo, insn, state->mask_control); + if (devinfo->ver >= 12) + brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb)); + brw_inst_set_saturate(devinfo, insn, state->saturate); + brw_inst_set_pred_control(devinfo, insn, state->predicate); + brw_inst_set_pred_inv(devinfo, insn, state->pred_inv); + + if (is_3src(isa, brw_inst_opcode(isa, insn)) && + state->access_mode == BRW_ALIGN_16) { + brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2); + if (devinfo->ver >= 7) + brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2); + } else { + brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2); + if (devinfo->ver >= 7) + brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2); + } + + if (devinfo->ver >= 6 && devinfo->ver < 20) + brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control); +} + +static brw_inst * +brw_append_insns(struct brw_codegen *p, 
unsigned nr_insn, unsigned alignment) +{ + assert(util_is_power_of_two_or_zero(sizeof(brw_inst))); + assert(util_is_power_of_two_or_zero(alignment)); + const unsigned align_insn = MAX2(alignment / sizeof(brw_inst), 1); + const unsigned start_insn = ALIGN(p->nr_insn, align_insn); + const unsigned new_nr_insn = start_insn + nr_insn; + + if (p->store_size < new_nr_insn) { + p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst)); + p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size); + } + + /* Memset any padding due to alignment to 0. We don't want to be hashing + * or caching a bunch of random bits we got from a memory allocation. + */ + if (p->nr_insn < start_insn) { + memset(&p->store[p->nr_insn], 0, + (start_insn - p->nr_insn) * sizeof(brw_inst)); + } + + assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst)); + p->nr_insn = new_nr_insn; + p->next_insn_offset = new_nr_insn * sizeof(brw_inst); + + return &p->store[start_insn]; +} + +void +brw_realign(struct brw_codegen *p, unsigned alignment) +{ + brw_append_insns(p, 0, alignment); +} + +int +brw_append_data(struct brw_codegen *p, void *data, + unsigned size, unsigned alignment) +{ + unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst)); + void *dst = brw_append_insns(p, nr_insn, alignment); + memcpy(dst, data, size); + + /* If it's not a whole number of instructions, memset the end */ + if (size < nr_insn * sizeof(brw_inst)) + memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size); + + return dst - (void *)p->store; +} + +#define next_insn brw_next_insn +brw_inst * +brw_next_insn(struct brw_codegen *p, unsigned opcode) +{ + brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst)); + + memset(insn, 0, sizeof(*insn)); + brw_inst_set_opcode(p->isa, insn, opcode); + + /* Apply the default instruction state */ + brw_inst_set_state(p->isa, insn, p->current); + + return insn; +} + +void +brw_add_reloc(struct brw_codegen *p, uint32_t id, + enum brw_shader_reloc_type type, + 
uint32_t offset, uint32_t delta) +{ + if (p->num_relocs + 1 > p->reloc_array_size) { + p->reloc_array_size = MAX2(16, p->reloc_array_size * 2); + p->relocs = reralloc(p->mem_ctx, p->relocs, + struct brw_shader_reloc, p->reloc_array_size); + } + + p->relocs[p->num_relocs++] = (struct brw_shader_reloc) { + .id = id, + .type = type, + .offset = offset, + .delta = delta, + }; +} + +static brw_inst * +brw_alu1(struct brw_codegen *p, unsigned opcode, + struct brw_reg dest, struct brw_reg src) +{ + brw_inst *insn = next_insn(p, opcode); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src); + return insn; +} + +static brw_inst * +brw_alu2(struct brw_codegen *p, unsigned opcode, + struct brw_reg dest, struct brw_reg src0, struct brw_reg src1) +{ + /* 64-bit immediates are only supported on 1-src instructions */ + assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4); + assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4); + + brw_inst *insn = next_insn(p, opcode); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + return insn; +} + +static int +get_3src_subreg_nr(struct brw_reg reg) +{ + /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions + * use 32-bit units (components 0..7). Since they only support F/D/UD + * types, this doesn't lose any flexibility, but uses fewer bits. 
+ */ + return reg.subnr / 4; +} + +static enum gfx10_align1_3src_vertical_stride +to_3src_align1_vstride(const struct intel_device_info *devinfo, + enum brw_vertical_stride vstride) +{ + switch (vstride) { + case BRW_VERTICAL_STRIDE_0: + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0; + case BRW_VERTICAL_STRIDE_1: + assert(devinfo->ver >= 12); + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1; + case BRW_VERTICAL_STRIDE_2: + assert(devinfo->ver < 12); + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2; + case BRW_VERTICAL_STRIDE_4: + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4; + case BRW_VERTICAL_STRIDE_8: + case BRW_VERTICAL_STRIDE_16: + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8; + default: + unreachable("invalid vstride"); + } +} + + +static enum gfx10_align1_3src_src_horizontal_stride +to_3src_align1_hstride(enum brw_horizontal_stride hstride) +{ + switch (hstride) { + case BRW_HORIZONTAL_STRIDE_0: + return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0; + case BRW_HORIZONTAL_STRIDE_1: + return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1; + case BRW_HORIZONTAL_STRIDE_2: + return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2; + case BRW_HORIZONTAL_STRIDE_4: + return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4; + default: + unreachable("invalid hstride"); + } +} + +static brw_inst * +brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1, struct brw_reg src2) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *inst = next_insn(p, opcode); + + gfx7_convert_mrf_to_grf(p, &dest); + + assert(dest.nr < XE2_MAX_GRF); + + if (devinfo->ver >= 10) + assert(!(src0.file == BRW_IMMEDIATE_VALUE && + src2.file == BRW_IMMEDIATE_VALUE)); + + assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < XE2_MAX_GRF); + assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < XE2_MAX_GRF); + assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < XE2_MAX_GRF); + assert(dest.address_mode == BRW_ADDRESS_DIRECT); + assert(src0.address_mode == BRW_ADDRESS_DIRECT); + 
assert(src1.address_mode == BRW_ADDRESS_DIRECT); + assert(src2.address_mode == BRW_ADDRESS_DIRECT); + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_ACCUMULATOR)); + + if (devinfo->ver >= 12) { + brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file); + brw_inst_set_3src_dst_reg_nr(devinfo, inst, phys_nr(devinfo, dest)); + } else { + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) { + brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, + BRW_ALIGN1_3SRC_ACCUMULATOR); + brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); + } else { + brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE); + brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr); + } + } + brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest) / 8); + + brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1); + + if (brw_reg_type_is_floating_point(dest.type)) { + brw_inst_set_3src_a1_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); + } else { + brw_inst_set_3src_a1_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_INT); + } + + brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type); + brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type); + brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type); + brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type); + + if (src0.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud); + } else { + brw_inst_set_3src_a1_src0_vstride( + devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride)); + brw_inst_set_3src_a1_src0_hstride(devinfo, inst, + to_3src_align1_hstride(src0.hstride)); + brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, phys_subnr(devinfo, src0)); + if (src0.type == BRW_REGISTER_TYPE_NF) { + brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); + 
} else { + brw_inst_set_3src_src0_reg_nr(devinfo, inst, phys_nr(devinfo, src0)); + } + brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs); + brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate); + } + brw_inst_set_3src_a1_src1_vstride( + devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride)); + brw_inst_set_3src_a1_src1_hstride(devinfo, inst, + to_3src_align1_hstride(src1.hstride)); + + brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, phys_subnr(devinfo, src1)); + if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) { + brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); + } else { + brw_inst_set_3src_src1_reg_nr(devinfo, inst, phys_nr(devinfo, src1)); + } + brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs); + brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate); + + if (src2.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud); + } else { + brw_inst_set_3src_a1_src2_hstride(devinfo, inst, + to_3src_align1_hstride(src2.hstride)); + /* no vstride on src2 */ + brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, phys_subnr(devinfo, src2)); + brw_inst_set_3src_src2_reg_nr(devinfo, inst, phys_nr(devinfo, src2)); + brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs); + brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate); + } + + assert(src0.file == BRW_GENERAL_REGISTER_FILE || + src0.file == BRW_IMMEDIATE_VALUE || + (src0.file == BRW_ARCHITECTURE_REGISTER_FILE && + src0.type == BRW_REGISTER_TYPE_NF)); + assert(src1.file == BRW_GENERAL_REGISTER_FILE || + (src1.file == BRW_ARCHITECTURE_REGISTER_FILE && + src1.nr == BRW_ARF_ACCUMULATOR)); + assert(src2.file == BRW_GENERAL_REGISTER_FILE || + src2.file == BRW_IMMEDIATE_VALUE); + + if (devinfo->ver >= 12) { + if (src0.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1); + } else { + brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file); + } + + brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file); + + if 
(src2.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1); + } else { + brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file); + } + } else { + brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, + src0.file == BRW_GENERAL_REGISTER_FILE ? + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : + BRW_ALIGN1_3SRC_IMMEDIATE_VALUE); + brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, + src1.file == BRW_GENERAL_REGISTER_FILE ? + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : + BRW_ALIGN1_3SRC_ACCUMULATOR); + brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, + src2.file == BRW_GENERAL_REGISTER_FILE ? + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : + BRW_ALIGN1_3SRC_IMMEDIATE_VALUE); + } + + } else { + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_MESSAGE_REGISTER_FILE); + assert(dest.type == BRW_REGISTER_TYPE_F || + dest.type == BRW_REGISTER_TYPE_DF || + dest.type == BRW_REGISTER_TYPE_D || + dest.type == BRW_REGISTER_TYPE_UD || + (dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8)); + if (devinfo->ver == 6) { + brw_inst_set_3src_a16_dst_reg_file(devinfo, inst, + dest.file == BRW_MESSAGE_REGISTER_FILE); + } + brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr); + brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4); + brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask); + + assert(src0.file == BRW_GENERAL_REGISTER_FILE); + brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle); + brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0)); + brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr); + brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs); + brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate); + brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst, + src0.vstride == BRW_VERTICAL_STRIDE_0); + + assert(src1.file == BRW_GENERAL_REGISTER_FILE); + brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle); + brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, 
get_3src_subreg_nr(src1)); + brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr); + brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs); + brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate); + brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst, + src1.vstride == BRW_VERTICAL_STRIDE_0); + + assert(src2.file == BRW_GENERAL_REGISTER_FILE); + brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle); + brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2)); + brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr); + brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs); + brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate); + brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst, + src2.vstride == BRW_VERTICAL_STRIDE_0); + + if (devinfo->ver >= 7) { + /* Set both the source and destination types based on dest.type, + * ignoring the source register types. The MAD and LRP emitters ensure + * that all four types are float. The BFE and BFI2 emitters, however, + * may send us mixed D and UD types and want us to ignore that and use + * the destination type. + */ + brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type); + brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type); + + /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType: + * + * "Three source instructions can use operands with mixed-mode + * precision. When SrcType field is set to :f or :hf it defines + * precision for source 0 only, and fields Src1Type and Src2Type + * define precision for other source operands: + * + * 0b = :f. Single precision Float (32-bit). + * 1b = :hf. Half precision Float (16-bit)." 
+ */ + if (src1.type == BRW_REGISTER_TYPE_HF) + brw_inst_set_3src_a16_src1_type(devinfo, inst, 1); + + if (src2.type == BRW_REGISTER_TYPE_HF) + brw_inst_set_3src_a16_src2_type(devinfo, inst, 1); + } + } + + return inst; +} + +static brw_inst * +brw_dpas_three_src(struct brw_codegen *p, enum gfx12_systolic_depth opcode, + unsigned sdepth, unsigned rcount, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1, struct brw_reg src2) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *inst = next_insn(p, opcode); + + assert(dest.file == BRW_GENERAL_REGISTER_FILE); + brw_inst_set_dpas_3src_dst_reg_file(devinfo, inst, + BRW_GENERAL_REGISTER_FILE); + brw_inst_set_dpas_3src_dst_reg_nr(devinfo, inst, dest.nr); + brw_inst_set_dpas_3src_dst_subreg_nr(devinfo, inst, dest.subnr); + + if (brw_reg_type_is_floating_point(dest.type)) { + brw_inst_set_dpas_3src_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); + } else { + brw_inst_set_dpas_3src_exec_type(devinfo, inst, + BRW_ALIGN1_3SRC_EXEC_TYPE_INT); + } + + brw_inst_set_dpas_3src_sdepth(devinfo, inst, sdepth); + brw_inst_set_dpas_3src_rcount(devinfo, inst, rcount - 1); + + brw_inst_set_dpas_3src_dst_type(devinfo, inst, dest.type); + brw_inst_set_dpas_3src_src0_type(devinfo, inst, src0.type); + brw_inst_set_dpas_3src_src1_type(devinfo, inst, src1.type); + brw_inst_set_dpas_3src_src2_type(devinfo, inst, src2.type); + + assert(src0.file == BRW_GENERAL_REGISTER_FILE || + (src0.file == BRW_ARCHITECTURE_REGISTER_FILE && + src0.nr == BRW_ARF_NULL)); + + brw_inst_set_dpas_3src_src0_reg_file(devinfo, inst, src0.file); + brw_inst_set_dpas_3src_src0_reg_nr(devinfo, inst, src0.nr); + brw_inst_set_dpas_3src_src0_subreg_nr(devinfo, inst, src0.subnr); + + assert(src1.file == BRW_GENERAL_REGISTER_FILE); + + brw_inst_set_dpas_3src_src1_reg_file(devinfo, inst, src1.file); + brw_inst_set_dpas_3src_src1_reg_nr(devinfo, inst, src1.nr); + brw_inst_set_dpas_3src_src1_subreg_nr(devinfo, inst, src1.subnr); + 
brw_inst_set_dpas_3src_src1_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE); + + assert(src2.file == BRW_GENERAL_REGISTER_FILE); + + brw_inst_set_dpas_3src_src2_reg_file(devinfo, inst, src2.file); + brw_inst_set_dpas_3src_src2_reg_nr(devinfo, inst, src2.nr); + brw_inst_set_dpas_3src_src2_subreg_nr(devinfo, inst, src2.subnr); + brw_inst_set_dpas_3src_src2_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE); + + return inst; +} + +/*********************************************************************** + * Convenience routines. + */ +#define ALU1(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0) \ +{ \ + return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \ +} + +#define ALU2(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1) \ +{ \ + return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \ +} + +#define ALU3(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1, \ + struct brw_reg src2) \ +{ \ + if (p->current->access_mode == BRW_ALIGN_16) { \ + if (src0.vstride == BRW_VERTICAL_STRIDE_0) \ + src0.swizzle = BRW_SWIZZLE_XXXX; \ + if (src1.vstride == BRW_VERTICAL_STRIDE_0) \ + src1.swizzle = BRW_SWIZZLE_XXXX; \ + if (src2.vstride == BRW_VERTICAL_STRIDE_0) \ + src2.swizzle = BRW_SWIZZLE_XXXX; \ + } \ + return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ +} + +#define ALU3F(OP) \ +brw_inst *brw_##OP(struct brw_codegen *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1, \ + struct brw_reg src2) \ +{ \ + assert(dest.type == BRW_REGISTER_TYPE_F || \ + dest.type == BRW_REGISTER_TYPE_DF); \ + if (dest.type == BRW_REGISTER_TYPE_F) { \ + assert(src0.type == BRW_REGISTER_TYPE_F); \ + assert(src1.type == BRW_REGISTER_TYPE_F); \ + assert(src2.type == BRW_REGISTER_TYPE_F); \ + } else if (dest.type == BRW_REGISTER_TYPE_DF) { \ + assert(src0.type == 
BRW_REGISTER_TYPE_DF); \ + assert(src1.type == BRW_REGISTER_TYPE_DF); \ + assert(src2.type == BRW_REGISTER_TYPE_DF); \ + } \ + \ + if (p->current->access_mode == BRW_ALIGN_16) { \ + if (src0.vstride == BRW_VERTICAL_STRIDE_0) \ + src0.swizzle = BRW_SWIZZLE_XXXX; \ + if (src1.vstride == BRW_VERTICAL_STRIDE_0) \ + src1.swizzle = BRW_SWIZZLE_XXXX; \ + if (src2.vstride == BRW_VERTICAL_STRIDE_0) \ + src2.swizzle = BRW_SWIZZLE_XXXX; \ + } \ + return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ +} + +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU1(DIM) +ALU2(ASR) +ALU2(ROL) +ALU2(ROR) +ALU3(CSEL) +ALU1(FRC) +ALU1(RNDD) +ALU1(RNDE) +ALU1(RNDU) +ALU1(RNDZ) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU3(DP4A) +ALU3(MAD) +ALU3F(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) +ALU2(ADDC) +ALU2(SUBB) +ALU3(ADD3) + +brw_inst * +brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0) +{ + const struct intel_device_info *devinfo = p->devinfo; + + /* When converting F->DF on IVB/BYT, every odd source channel is ignored. + * To avoid the problems that causes, we use an source region to + * read each element twice. 
+ */ + if (devinfo->verx10 == 70 && + brw_get_default_access_mode(p) == BRW_ALIGN_1 && + dest.type == BRW_REGISTER_TYPE_DF && + (src0.type == BRW_REGISTER_TYPE_F || + src0.type == BRW_REGISTER_TYPE_D || + src0.type == BRW_REGISTER_TYPE_UD) && + !has_scalar_region(src0)) { + assert(src0.vstride == src0.width + src0.hstride); + src0.vstride = src0.hstride; + src0.width = BRW_WIDTH_2; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + } + + return brw_alu1(p, BRW_OPCODE_MOV, dest, src0); +} + +brw_inst * +brw_ADD(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + /* 6.2.2: add */ + if (src0.type == BRW_REGISTER_TYPE_F || + (src0.file == BRW_IMMEDIATE_VALUE && + src0.type == BRW_REGISTER_TYPE_VF)) { + assert(src1.type != BRW_REGISTER_TYPE_UD); + assert(src1.type != BRW_REGISTER_TYPE_D); + } + + if (src1.type == BRW_REGISTER_TYPE_F || + (src1.file == BRW_IMMEDIATE_VALUE && + src1.type == BRW_REGISTER_TYPE_VF)) { + assert(src0.type != BRW_REGISTER_TYPE_UD); + assert(src0.type != BRW_REGISTER_TYPE_D); + } + + return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1); +} + +brw_inst * +brw_AVG(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + assert(dest.type == src0.type); + assert(src0.type == src1.type); + switch (src0.type) { + case BRW_REGISTER_TYPE_B: + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UD: + break; + default: + unreachable("Bad type for brw_AVG"); + } + + return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1); +} + +brw_inst * +brw_MUL(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + /* 6.32.38: mul */ + if (src0.type == BRW_REGISTER_TYPE_D || + src0.type == BRW_REGISTER_TYPE_UD || + src1.type == BRW_REGISTER_TYPE_D || + src1.type == BRW_REGISTER_TYPE_UD) { + assert(dest.type != BRW_REGISTER_TYPE_F); + } + + if (src0.type == BRW_REGISTER_TYPE_F || + 
(src0.file == BRW_IMMEDIATE_VALUE && + src0.type == BRW_REGISTER_TYPE_VF)) { + assert(src1.type != BRW_REGISTER_TYPE_UD); + assert(src1.type != BRW_REGISTER_TYPE_D); + } + + if (src1.type == BRW_REGISTER_TYPE_F || + (src1.file == BRW_IMMEDIATE_VALUE && + src1.type == BRW_REGISTER_TYPE_VF)) { + assert(src0.type != BRW_REGISTER_TYPE_UD); + assert(src0.type != BRW_REGISTER_TYPE_D); + } + + assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE || + src0.nr != BRW_ARF_ACCUMULATOR); + assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE || + src1.nr != BRW_ARF_ACCUMULATOR); + + return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1); +} + +brw_inst * +brw_LINE(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + src0.vstride = BRW_VERTICAL_STRIDE_0; + src0.width = BRW_WIDTH_1; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1); +} + +brw_inst * +brw_PLN(struct brw_codegen *p, struct brw_reg dest, + struct brw_reg src0, struct brw_reg src1) +{ + src0.vstride = BRW_VERTICAL_STRIDE_0; + src0.width = BRW_WIDTH_1; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + src1.vstride = BRW_VERTICAL_STRIDE_8; + src1.width = BRW_WIDTH_8; + src1.hstride = BRW_HORIZONTAL_STRIDE_1; + return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1); +} + +brw_inst * +brw_DPAS(struct brw_codegen *p, enum gfx12_systolic_depth sdepth, + unsigned rcount, struct brw_reg dest, struct brw_reg src0, + struct brw_reg src1, struct brw_reg src2) +{ + return brw_dpas_three_src(p, BRW_OPCODE_DPAS, sdepth, rcount, dest, src0, + src1, src2); +} + +brw_inst * +brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src) +{ + assert(p->devinfo->ver == 7); + + /* The F32TO16 instruction doesn't support 32-bit destination types in + * Align1 mode. Gfx7 (only) does zero out the high 16 bits in Align16 + * mode as an undocumented feature. 
+ */ + if (BRW_ALIGN_16 == brw_get_default_access_mode(p)) { + assert(dst.type == BRW_REGISTER_TYPE_UD); + } else { + assert(dst.type == BRW_REGISTER_TYPE_W || + dst.type == BRW_REGISTER_TYPE_UW); + } + + return brw_alu1(p, BRW_OPCODE_F32TO16, dst, src); +} + +brw_inst * +brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src) +{ + assert(p->devinfo->ver == 7); + + if (BRW_ALIGN_16 == brw_get_default_access_mode(p)) { + assert(src.type == BRW_REGISTER_TYPE_UD); + } else { + /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: + * + * Because this instruction does not have a 16-bit floating-point + * type, the source data type must be Word (W). The destination type + * must be F (Float). + */ + assert(src.type == BRW_REGISTER_TYPE_W || + src.type == BRW_REGISTER_TYPE_UW); + } + + return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src); +} + + +void brw_NOP(struct brw_codegen *p) +{ + brw_inst *insn = next_insn(p, BRW_OPCODE_NOP); + memset(insn, 0, sizeof(*insn)); + brw_inst_set_opcode(p->isa, insn, BRW_OPCODE_NOP); +} + +void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func) +{ + brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC); + brw_inst_set_cond_modifier(p->devinfo, insn, func); +} + +/*********************************************************************** + * Comparisons, if/else/endif + */ + +brw_inst * +brw_JMPI(struct brw_codegen *p, struct brw_reg index, + unsigned predicate_control) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_reg ip = brw_ip_reg(); + brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index); + + brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1); + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); + brw_inst_set_pred_control(devinfo, inst, predicate_control); + + return inst; +} + +static void +push_if_stack(struct brw_codegen *p, brw_inst *inst) +{ + p->if_stack[p->if_stack_depth] = inst - p->store; + + 
p->if_stack_depth++; + if (p->if_stack_array_size <= p->if_stack_depth) { + p->if_stack_array_size *= 2; + p->if_stack = reralloc(p->mem_ctx, p->if_stack, int, + p->if_stack_array_size); + } +} + +static brw_inst * +pop_if_stack(struct brw_codegen *p) +{ + p->if_stack_depth--; + return &p->store[p->if_stack[p->if_stack_depth]]; +} + +static void +push_loop_stack(struct brw_codegen *p, brw_inst *inst) +{ + if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) { + p->loop_stack_array_size *= 2; + p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int, + p->loop_stack_array_size); + p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int, + p->loop_stack_array_size); + } + + p->loop_stack[p->loop_stack_depth] = inst - p->store; + p->loop_stack_depth++; + p->if_depth_in_loop[p->loop_stack_depth] = 0; +} + +static brw_inst * +get_inner_do_insn(struct brw_codegen *p) +{ + return &p->store[p->loop_stack[p->loop_stack_depth - 1]]; +} + +/* EU takes the value from the flag register and pushes it onto some + * sort of a stack (presumably merging with any flag value already on + * the stack). Within an if block, the flags at the top of the stack + * control execution on each channel of the unit, eg. on each of the + * 16 pixel values in our wm programs. + * + * When the matching 'else' instruction is reached (presumably by + * countdown of the instruction count patched in by our ELSE/ENDIF + * functions), the relevant flags are inverted. + * + * When the matching 'endif' instruction is reached, the flags are + * popped off. If the stack is now empty, normal execution resumes. 
+ */ +brw_inst * +brw_IF(struct brw_codegen *p, unsigned execute_size) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_IF); + + /* Override the defaults for this instruction: + */ + if (devinfo->ver < 6) { + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } else if (devinfo->ver == 6) { + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_gfx6_jump_count(devinfo, insn, 0); + brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + } else if (devinfo->ver == 7) { + brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + brw_set_src1(p, insn, brw_imm_w(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } else { + brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); + if (devinfo->ver < 12) + brw_set_src0(p, insn, brw_imm_d(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } + + brw_inst_set_exec_size(devinfo, insn, execute_size); + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE); + if (!p->single_program_flow && devinfo->ver < 6) + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + + push_if_stack(p, insn); + p->if_depth_in_loop[p->loop_stack_depth]++; + return insn; +} + +/* This function is only used for gfx6-style IF instructions with an + * embedded comparison (conditional modifier). It is not used on gfx7. 
+ */ +brw_inst * +gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional, + struct brw_reg src0, struct brw_reg src1) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_IF); + + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + brw_inst_set_gfx6_jump_count(devinfo, insn, 0); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + + assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE); + assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE); + brw_inst_set_cond_modifier(devinfo, insn, conditional); + + push_if_stack(p, insn); + return insn; +} + +/** + * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs. + */ +static void +convert_IF_ELSE_to_ADD(struct brw_codegen *p, + brw_inst *if_inst, brw_inst *else_inst) +{ + const struct intel_device_info *devinfo = p->devinfo; + + /* The next instruction (where the ENDIF would be, if it existed) */ + brw_inst *next_inst = &p->store[p->nr_insn]; + + assert(p->single_program_flow); + assert(if_inst != NULL && brw_inst_opcode(p->isa, if_inst) == BRW_OPCODE_IF); + assert(else_inst == NULL || brw_inst_opcode(p->isa, else_inst) == BRW_OPCODE_ELSE); + assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1); + + /* Convert IF to an ADD instruction that moves the instruction pointer + * to the first instruction of the ELSE block. If there is no ELSE + * block, point to where ENDIF would be. Reverse the predicate. + * + * There's no need to execute an ENDIF since we don't need to do any + * stack operations, and if we're currently executing, we just want to + * continue normally. + */ + brw_inst_set_opcode(p->isa, if_inst, BRW_OPCODE_ADD); + brw_inst_set_pred_inv(devinfo, if_inst, true); + + if (else_inst != NULL) { + /* Convert ELSE to an ADD instruction that points where the ENDIF + * would be. 
+ */ + brw_inst_set_opcode(p->isa, else_inst, BRW_OPCODE_ADD); + + brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16); + brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16); + } else { + brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16); + } +} + +/** + * Patch IF and ELSE instructions with appropriate jump targets. + */ +static void +patch_IF_ELSE(struct brw_codegen *p, + brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst) +{ + const struct intel_device_info *devinfo = p->devinfo; + + /* We shouldn't be patching IF and ELSE instructions in single program flow + * mode when gen < 6, because in single program flow mode on those + * platforms, we convert flow control instructions to conditional ADDs that + * operate on IP (see brw_ENDIF). + * + * However, on Gfx6, writing to IP doesn't work in single program flow mode + * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may + * not be updated by non-flow control instructions."). And on later + * platforms, there is no significant benefit to converting control flow + * instructions to conditional ADDs. So we do patch IF and ELSE + * instructions in single program flow mode on those platforms. + */ + if (devinfo->ver < 6) + assert(!p->single_program_flow); + + assert(if_inst != NULL && brw_inst_opcode(p->isa, if_inst) == BRW_OPCODE_IF); + assert(endif_inst != NULL); + assert(else_inst == NULL || brw_inst_opcode(p->isa, else_inst) == BRW_OPCODE_ELSE); + + unsigned br = brw_jump_scale(devinfo); + + assert(brw_inst_opcode(p->isa, endif_inst) == BRW_OPCODE_ENDIF); + brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst)); + + if (else_inst == NULL) { + /* Patch IF -> ENDIF */ + if (devinfo->ver < 6) { + /* Turn it into an IFF, which means no mask stack operations for + * all-false and jumping past the ENDIF. 
+ */ + brw_inst_set_opcode(p->isa, if_inst, BRW_OPCODE_IFF); + brw_inst_set_gfx4_jump_count(devinfo, if_inst, + br * (endif_inst - if_inst + 1)); + brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0); + } else if (devinfo->ver == 6) { + /* As of gfx6, there is no IFF and IF must point to the ENDIF. */ + brw_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst)); + } else { + brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst)); + brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst)); + } + } else { + brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst)); + + /* Patch IF -> ELSE */ + if (devinfo->ver < 6) { + brw_inst_set_gfx4_jump_count(devinfo, if_inst, + br * (else_inst - if_inst)); + brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0); + } else if (devinfo->ver == 6) { + brw_inst_set_gfx6_jump_count(devinfo, if_inst, + br * (else_inst - if_inst + 1)); + } + + /* Patch ELSE -> ENDIF */ + if (devinfo->ver < 6) { + /* BRW_OPCODE_ELSE pre-gfx6 should point just past the + * matching ENDIF. + */ + brw_inst_set_gfx4_jump_count(devinfo, else_inst, + br * (endif_inst - else_inst + 1)); + brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1); + } else if (devinfo->ver == 6) { + /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF. 
*/ + brw_inst_set_gfx6_jump_count(devinfo, else_inst, + br * (endif_inst - else_inst)); + } else { + /* The IF instruction's JIP should point just past the ELSE */ + brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1)); + /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */ + brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst)); + + if (devinfo->ver >= 8 && devinfo->ver < 11) { + /* Set the ELSE instruction to use branch_ctrl with a join + * jump target pointing at the NOP inserted right before + * the ENDIF instruction in order to make sure it is + * executed in all cases, since attempting to do the same + * as on other generations could cause the EU to jump at + * the instruction immediately after the ENDIF due to + * Wa_220160235, which could cause the program to continue + * running with all channels disabled. + */ + brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst - 1)); + brw_inst_set_branch_control(devinfo, else_inst, true); + } else { + brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst)); + } + + if (devinfo->ver >= 8) { + /* Since we don't set branch_ctrl on Gfx11+, the ELSE's + * JIP and UIP both should point to ENDIF on those + * platforms. 
+ */ + brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst)); + } + } + } +} + +void +brw_ELSE(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_ELSE); + + if (devinfo->ver < 6) { + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } else if (devinfo->ver == 6) { + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_gfx6_jump_count(devinfo, insn, 0); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + } else if (devinfo->ver == 7) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_w(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } else { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + if (devinfo->ver < 12) + brw_set_src0(p, insn, brw_imm_d(0)); + brw_inst_set_jip(devinfo, insn, 0); + brw_inst_set_uip(devinfo, insn, 0); + } + + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE); + if (!p->single_program_flow && devinfo->ver < 6) + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + + push_if_stack(p, insn); +} + +void +brw_ENDIF(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = NULL; + brw_inst *else_inst = NULL; + brw_inst *if_inst = NULL; + brw_inst *tmp; + bool emit_endif = true; + + assert(p->if_stack_depth > 0); + + if (devinfo->ver >= 8 && devinfo->ver < 11 && + brw_inst_opcode(p->isa, &p->store[p->if_stack[ + p->if_stack_depth - 1]]) == BRW_OPCODE_ELSE) { + /* Insert a NOP to be specified as join instruction within the + * ELSE block, which is valid for an ELSE instruction with + * 
branch_ctrl on. The ELSE instruction will be set to jump + * here instead of to the ENDIF instruction, since attempting to + * do the latter would prevent the ENDIF from being executed in + * some cases due to Wa_220160235, which could cause the program + * to continue running with all channels disabled. + */ + brw_NOP(p); + } + + /* In single program flow mode, we can express IF and ELSE instructions + * equivalently as ADD instructions that operate on IP. On platforms prior + * to Gfx6, flow control instructions cause an implied thread switch, so + * this is a significant savings. + * + * However, on Gfx6, writing to IP doesn't work in single program flow mode + * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may + * not be updated by non-flow control instructions."). And on later + * platforms, there is no significant benefit to converting control flow + * instructions to conditional ADDs. So we only do this trick on Gfx4 and + * Gfx5. + */ + if (devinfo->ver < 6 && p->single_program_flow) + emit_endif = false; + + /* + * A single next_insn() may change the base address of instruction store + * memory(p->store), so call it first before referencing the instruction + * store pointer from an index + */ + if (emit_endif) + insn = next_insn(p, BRW_OPCODE_ENDIF); + + /* Pop the IF and (optional) ELSE instructions from the stack */ + p->if_depth_in_loop[p->loop_stack_depth]--; + tmp = pop_if_stack(p); + if (brw_inst_opcode(p->isa, tmp) == BRW_OPCODE_ELSE) { + else_inst = tmp; + tmp = pop_if_stack(p); + } + if_inst = tmp; + + if (!emit_endif) { + /* ENDIF is useless; don't bother emitting it. 
*/ + convert_IF_ELSE_to_ADD(p, if_inst, else_inst); + return; + } + + if (devinfo->ver < 6) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } else if (devinfo->ver == 6) { + brw_set_dest(p, insn, brw_imm_w(0)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + } else if (devinfo->ver == 7) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_w(0)); + } else { + brw_set_src0(p, insn, brw_imm_d(0)); + } + + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE); + if (devinfo->ver < 6) + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + + /* Also pop item off the stack in the endif instruction: */ + if (devinfo->ver < 6) { + brw_inst_set_gfx4_jump_count(devinfo, insn, 0); + brw_inst_set_gfx4_pop_count(devinfo, insn, 1); + } else if (devinfo->ver == 6) { + brw_inst_set_gfx6_jump_count(devinfo, insn, 2); + } else { + brw_inst_set_jip(devinfo, insn, 2); + } + patch_IF_ELSE(p, if_inst, else_inst, insn); +} + +brw_inst * +brw_BREAK(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_BREAK); + if (devinfo->ver >= 8) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, brw_imm_d(0x0)); + } else if (devinfo->ver >= 6) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } else { + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); + 
brw_inst_set_gfx4_pop_count(devinfo, insn, + p->if_depth_in_loop[p->loop_stack_depth]); + } + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + + return insn; +} + +brw_inst * +brw_CONT(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_CONTINUE); + brw_set_dest(p, insn, brw_ip_reg()); + if (devinfo->ver >= 8) { + brw_set_src0(p, insn, brw_imm_d(0x0)); + } else { + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); + } + + if (devinfo->ver < 6) { + brw_inst_set_gfx4_pop_count(devinfo, insn, + p->if_depth_in_loop[p->loop_stack_depth]); + } + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + return insn; +} + +brw_inst * +brw_HALT(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + insn = next_insn(p, BRW_OPCODE_HALT); + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + if (devinfo->ver < 6) { + /* From the Gfx4 PRM: + * + * "IP register must be put (for example, by the assembler) at + * and locations. + */ + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */ + } else if (devinfo->ver < 8) { + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */ + } else if (devinfo->ver < 12) { + brw_set_src0(p, insn, brw_imm_d(0x0)); + } + + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + return insn; +} + +/* DO/WHILE loop: + * + * The DO/WHILE is just an unterminated loop -- break or continue are + * used for control within the loop. 
We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gfx6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gfx6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (devinfo->ver >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted on gfx6+ (or in single-program-flow
       * mode); just push the loop start onto the loop stack.  Note the
       * returned pointer is the store slot of the *next* instruction to
       * be emitted, not an actual DO.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}

/**
 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->ver < 6);

   /* Walk backwards from the WHILE to the matching DO, patching every
    * unpatched BREAK/CONTINUE in between.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
+ */ + if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_BREAK && + brw_inst_gfx4_jump_count(devinfo, inst) == 0) { + brw_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1)); + } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_CONTINUE && + brw_inst_gfx4_jump_count(devinfo, inst) == 0) { + brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst)); + } + } +} + +brw_inst * +brw_WHILE(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn, *do_insn; + unsigned br = brw_jump_scale(devinfo); + + if (devinfo->ver >= 6) { + insn = next_insn(p, BRW_OPCODE_WHILE); + do_insn = get_inner_do_insn(p); + + if (devinfo->ver >= 8) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + if (devinfo->ver < 12) + brw_set_src0(p, insn, brw_imm_d(0)); + brw_inst_set_jip(devinfo, insn, br * (do_insn - insn)); + } else if (devinfo->ver == 7) { + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_w(0)); + brw_inst_set_jip(devinfo, insn, br * (do_insn - insn)); + } else { + brw_set_dest(p, insn, brw_imm_w(0)); + brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + } + + brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); + + } else { + if (p->single_program_flow) { + insn = next_insn(p, BRW_OPCODE_ADD); + do_insn = get_inner_do_insn(p); + + brw_set_dest(p, insn, brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16)); + brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); + } else { + insn = next_insn(p, BRW_OPCODE_WHILE); + do_insn = get_inner_do_insn(p); + + assert(brw_inst_opcode(p->isa, do_insn) == BRW_OPCODE_DO); + + brw_set_dest(p, insn, 
brw_ip_reg()); + brw_set_src0(p, insn, brw_ip_reg()); + brw_set_src1(p, insn, brw_imm_d(0)); + + brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn)); + brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1)); + brw_inst_set_gfx4_pop_count(devinfo, insn, 0); + + brw_patch_break_cont(p, insn); + } + } + brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); + + p->loop_stack_depth--; + + return insn; +} + +/* FORWARD JUMPS: + */ +void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *jmp_insn = &p->store[jmp_insn_idx]; + unsigned jmpi = 1; + + if (devinfo->ver >= 5) + jmpi = 2; + + assert(brw_inst_opcode(p->isa, jmp_insn) == BRW_OPCODE_JMPI); + assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE); + + brw_inst_set_gfx4_jump_count(devinfo, jmp_insn, + jmpi * (p->nr_insn - jmp_insn_idx - 1)); +} + +/* To integrate with the above, it makes sense that the comparison + * instruction should populate the flag register. It might be simpler + * just to use the flag reg for most WM tasks? + */ +void brw_CMP(struct brw_codegen *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = next_insn(p, BRW_OPCODE_CMP); + + brw_inst_set_cond_modifier(devinfo, insn, conditional); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + + /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds + * page says: + * "Any CMP instruction with a null destination must use a {switch}." + * + * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't + * mentioned on their work-arounds pages. 
+ */ + if (devinfo->ver == 7) { + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_NULL) { + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + } + } +} + +void brw_CMPN(struct brw_codegen *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN); + + brw_inst_set_cond_modifier(devinfo, insn, conditional); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + + /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA) + * says: + * + * If the destination is the null register, the {Switch} instruction + * option must be used. + * + * Page 77 of the Haswell PRM Volume 2b contains the same text. + */ + if (devinfo->ver == 7) { + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && + dest.nr == BRW_ARF_NULL) { + brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); + } + } +} + +/*********************************************************************** + * Helpers for the various SEND message types: + */ + +/** Extended math function, float[8]. + */ +void gfx4_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + unsigned msg_reg_nr, + struct brw_reg src, + unsigned precision ) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + unsigned data_type; + if (has_scalar_region(src)) { + data_type = BRW_MATH_DATA_SCALAR; + } else { + data_type = BRW_MATH_DATA_VECTOR; + } + + assert(devinfo->ver < 6); + + /* Example code doesn't set predicate_control for send + * instructions. 
+ */ + brw_inst_set_pred_control(devinfo, insn, 0); + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src); + brw_set_math_message(p, + insn, + function, + src.type == BRW_REGISTER_TYPE_D, + precision, + data_type); +} + +void gfx6_math(struct brw_codegen *p, + struct brw_reg dest, + unsigned function, + struct brw_reg src0, + struct brw_reg src1) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn = next_insn(p, BRW_OPCODE_MATH); + + assert(devinfo->ver >= 6); + + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + (devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE)); + + assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1); + if (devinfo->ver == 6) { + assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1); + } + + if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT || + function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER || + function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) { + assert(src0.type != BRW_REGISTER_TYPE_F); + assert(src1.type != BRW_REGISTER_TYPE_F); + assert(src1.file == BRW_GENERAL_REGISTER_FILE || + (devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE)); + /* From BSpec 6647/47428 "[Instruction] Extended Math Function": + * INT DIV function does not support source modifiers. + */ + assert(!src0.negate); + assert(!src0.abs); + assert(!src1.negate); + assert(!src1.abs); + } else { + assert(src0.type == BRW_REGISTER_TYPE_F || + (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9)); + assert(src1.type == BRW_REGISTER_TYPE_F || + (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9)); + } + + /* Source modifiers are ignored for extended math instructions on Gfx6. 
 */
   if (devinfo->ver == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}

/**
 * Return the right surface index to access the thread scratch space using
 * stateless dataport messages.
 */
unsigned
brw_scratch_surface_idx(const struct brw_codegen *p)
{
   /* The scratch space is thread-local so IA coherency is unnecessary. */
   if (p->devinfo->ver >= 8)
      return GFX8_BTI_STATELESS_NON_COHERENT;
   else
      return BRW_BTI_STATELESS;
}

/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);
   uint32_t msg_type;

   /* gfx6+ takes the offset in owords rather than bytes. */
   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the payload registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
+ */ + { + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + /* set message header global offset field (reg 0, element 2) */ + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, + retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, + mrf.nr, + 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(offset)); + + brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + { + struct brw_reg dest; + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + int send_commit_msg; + struct brw_reg src_header = retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW); + + brw_inst_set_sfid(devinfo, insn, target_cache); + brw_inst_set_compression(devinfo, insn, false); + + if (brw_inst_exec_size(devinfo, insn) >= 16) + src_header = vec16(src_header); + + assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE); + if (devinfo->ver < 6) + brw_inst_set_base_mrf(devinfo, insn, mrf.nr); + + /* Until gfx6, writes followed by reads from the same location + * are not guaranteed to be ordered unless write_commit is set. + * If set, then a no-op write is issued to the destination + * register to set a dependency, and a read from the destination + * can be used to ensure the ordering. + * + * For gfx6, only writes between different threads need ordering + * protection. Our use of DP writes is all about register + * spilling within a thread. 
+ */ + if (devinfo->ver >= 6) { + dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW); + send_commit_msg = 0; + } else { + dest = src_header; + send_commit_msg = 1; + } + + brw_set_dest(p, insn, dest); + if (devinfo->ver >= 6) { + brw_set_src0(p, insn, mrf); + } else { + brw_set_src0(p, insn, brw_null_reg()); + } + + if (devinfo->ver >= 6) + msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE; + else + msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE; + + brw_set_desc(p, insn, + brw_message_desc(devinfo, mlen, send_commit_msg, true) | + brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p), + BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8), + msg_type, send_commit_msg)); + } +} + + +/** + * Read a block of owords (half a GRF each) from the scratch buffer + * using a constant index per channel. + * + * Offset must be aligned to oword size (16 bytes). Used for register + * spilling. + */ +void +brw_oword_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + unsigned offset) +{ + const struct intel_device_info *devinfo = p->devinfo; + const struct tgl_swsb swsb = brw_get_default_swsb(p); + + if (devinfo->ver >= 6) + offset /= 16; + + if (p->devinfo->ver >= 7) { + /* On gen 7 and above, we no longer have message registers and we can + * send from any register we want. By using the destination register + * for the message, we guarantee that the implied message write won't + * accidentally overwrite anything. This has been a problem because + * the MRF registers and source for the final FB write are both fixed + * and may overlap. + */ + mrf = retype(dest, BRW_REGISTER_TYPE_UD); + } else { + mrf = retype(mrf, BRW_REGISTER_TYPE_UD); + } + dest = retype(dest, BRW_REGISTER_TYPE_UW); + + const unsigned rlen = num_regs; + const unsigned target_cache = + (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE : + devinfo->ver >= 6 ? 
GFX6_SFID_DATAPORT_RENDER_CACHE : + BRW_SFID_DATAPORT_READ); + + { + brw_push_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + /* set message header global offset field (reg 0, element 2) */ + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset)); + + brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + { + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + + brw_inst_set_sfid(devinfo, insn, target_cache); + assert(brw_inst_pred_control(devinfo, insn) == 0); + brw_inst_set_compression(devinfo, insn, false); + + brw_set_dest(p, insn, dest); /* UW? */ + if (devinfo->ver >= 6) { + brw_set_src0(p, insn, mrf); + } else { + brw_set_src0(p, insn, brw_null_reg()); + brw_inst_set_base_mrf(devinfo, insn, mrf.nr); + } + + brw_set_desc(p, insn, + brw_message_desc(devinfo, 1, rlen, true) | + brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p), + BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8), + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, + BRW_DATAPORT_READ_TARGET_RENDER_CACHE)); + } +} + +void +gfx7_block_read_scratch(struct brw_codegen *p, + struct brw_reg dest, + int num_regs, + unsigned offset) +{ + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE); + + brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW)); + + /* The HW requires that the header is present; this is to get the g0.5 + * scratch offset. + */ + brw_set_src0(p, insn, brw_vec8_grf(0, 0)); + + /* According to the docs, offset is "A 12-bit HWord offset into the memory + * Immediate Memory buffer as specified by binding table 0xFF." 
An HWORD + * is 32 bytes, which happens to be the size of a register. + */ + offset /= REG_SIZE; + assert(offset < (1 << 12)); + + gfx7_set_dp_scratch_message(p, insn, + false, /* scratch read */ + false, /* OWords */ + false, /* invalidate after read */ + num_regs, + offset, + 1, /* mlen: just g0 */ + num_regs, /* rlen */ + true); /* header present */ +} + +/** + * Read float[4] vectors from the data port constant cache. + * Location (in buffer) should be a multiple of 16. + * Used for fetching shader constants. + */ +void brw_oword_block_read(struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg mrf, + uint32_t offset, + uint32_t bind_table_index) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned target_cache = + (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE : + BRW_SFID_DATAPORT_READ); + const unsigned exec_size = 1 << brw_get_default_exec_size(p); + const struct tgl_swsb swsb = brw_get_default_swsb(p); + + /* On newer hardware, offset is in units of owords. 
*/ + if (devinfo->ver >= 6) + offset /= 16; + + mrf = retype(mrf, BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + /* set message header global offset field (reg 0, element 2) */ + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, + retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, + mrf.nr, + 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(offset)); + brw_pop_insn_state(p); + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + + brw_inst_set_sfid(devinfo, insn, target_cache); + + /* cast dest to a uword[8] vector */ + dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW); + + brw_set_dest(p, insn, dest); + if (devinfo->ver >= 6) { + brw_set_src0(p, insn, mrf); + } else { + brw_set_src0(p, insn, brw_null_reg()); + brw_inst_set_base_mrf(devinfo, insn, mrf.nr); + } + + brw_set_desc(p, insn, + brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) | + brw_dp_read_desc(devinfo, bind_table_index, + BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size), + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, + BRW_DATAPORT_READ_TARGET_DATA_CACHE)); + + brw_pop_insn_state(p); +} + +brw_inst * +brw_fb_WRITE(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg implied_header, + unsigned msg_control, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool eot, + bool last_render_target, + bool header_present) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned target_cache = + (devinfo->ver >= 6 ? 
GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   struct brw_reg dest, src0;

   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->ver >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;
   } else {
      /* Pre-gfx6 the payload lives in the MRF and the implied header is
       * sent as src0.
       */
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_fb_write_desc(devinfo, binding_table_index, msg_control,
                                  last_render_target,
                                  false /* coarse_write */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}

/**
 * Emit a Gfx9+ render-target read message (SENDC to the render cache).
 * The response lands in \p dst; the RT slot group is derived from the
 * current default quarter-control group.
 */
brw_inst *
gfx9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver >= 9);
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
                       1 << brw_get_default_exec_size(p), per_sample));
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}

/**
 * Texture sample instruction.
+ * Note: the msg_type plus msg_length values determine exactly what kind + * of sampling operation is performed. See volume 4, page 161 of docs. + */ +void brw_SAMPLE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + unsigned header_present, + unsigned simd_mode, + unsigned return_format) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + if (msg_reg_nr != -1) + gfx6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = next_insn(p, BRW_OPCODE_SEND); + brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER); + brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */ + + /* From the 965 PRM (volume 4, part 1, section 14.2.41): + * + * "Instruction compression is not allowed for this instruction (that + * is, send). The hardware behavior is undefined if this instruction is + * set as compressed. However, compress control can be set to "SecHalf" + * to affect the EMask generation." + * + * No similar wording is found in later PRMs, but there are examples + * utilizing send with SecHalf. More importantly, SIMD8 sampler messages + * are allowed in SIMD16 mode and they could not work without SecHalf. For + * these reasons, we allow BRW_COMPRESSION_2NDHALF here. + */ + brw_inst_set_compression(devinfo, insn, false); + + if (devinfo->ver < 6) + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_desc(p, insn, + brw_message_desc(devinfo, msg_length, response_length, + header_present) | + brw_sampler_desc(devinfo, binding_table_index, sampler, + msg_type, simd_mode, return_format)); +} + +/* Adjust the message header's sampler state pointer to + * select the correct group of 16 samplers. 
+ */ +void brw_adjust_sampler_state_pointer(struct brw_codegen *p, + struct brw_reg header, + struct brw_reg sampler_index) +{ + /* The "Sampler Index" field can only store values between 0 and 15. + * However, we can add an offset to the "Sampler State Pointer" + * field, effectively selecting a different set of 16 samplers. + * + * The "Sampler State Pointer" needs to be aligned to a 32-byte + * offset, and each sampler state is only 16-bytes, so we can't + * exclusively use the offset - we have to use both. + */ + + const struct intel_device_info *devinfo = p->devinfo; + + if (sampler_index.file == BRW_IMMEDIATE_VALUE) { + const int sampler_state_size = 16; /* 16 bytes */ + uint32_t sampler = sampler_index.ud; + + if (sampler >= 16) { + assert(devinfo->verx10 >= 75); + brw_ADD(p, + get_element_ud(header, 3), + get_element_ud(brw_vec8_grf(0, 0), 3), + brw_imm_ud(16 * (sampler / 16) * sampler_state_size)); + } + } else { + /* Non-const sampler array indexing case */ + if (devinfo->verx10 <= 70) { + return; + } + + struct brw_reg temp = get_element_ud(header, 3); + + brw_push_insn_state(p); + brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0)); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_SHL(p, temp, temp, brw_imm_ud(4)); + brw_ADD(p, + get_element_ud(header, 3), + get_element_ud(brw_vec8_grf(0, 0), 3), + temp); + brw_pop_insn_state(p); + } +} + +/* All these variables are pretty confusing - we might be better off + * using bitmasks and macros for this, in the old style. Or perhaps + * just having the caller instantiate the fields in dword3 itself. 
+ */ +void brw_urb_WRITE(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + enum brw_urb_write_flags flags, + unsigned msg_length, + unsigned response_length, + unsigned offset, + unsigned swizzle) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + gfx6_resolve_implied_move(p, &src0, msg_reg_nr); + + if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) { + /* Enable Channel Masks in the URB_WRITE_HWORD message header */ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5), + BRW_REGISTER_TYPE_UD), + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xff00)); + brw_pop_insn_state(p); + } + + insn = next_insn(p, BRW_OPCODE_SEND); + + assert(msg_length < BRW_MAX_MRF(devinfo->ver)); + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, brw_imm_d(0)); + + if (devinfo->ver < 6) + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_urb_message(p, + insn, + flags, + msg_length, + response_length, + offset, + swizzle); +} + +void +brw_send_indirect_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg desc, + unsigned desc_imm, + bool eot) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_inst *send; + + dst = retype(dst, BRW_REGISTER_TYPE_UW); + + assert(desc.type == BRW_REGISTER_TYPE_UD); + + if (desc.file == BRW_IMMEDIATE_VALUE) { + send = next_insn(p, BRW_OPCODE_SEND); + brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); + brw_set_desc(p, send, desc.ud | desc_imm); + } else { + const struct tgl_swsb swsb = brw_get_default_swsb(p); + struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + 
brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + /* Load the indirect descriptor to an address register using OR so the + * caller can specify additional descriptor bits with the desc_imm + * immediate. + */ + brw_OR(p, addr, desc, brw_imm_ud(desc_imm)); + + brw_pop_insn_state(p); + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + send = next_insn(p, BRW_OPCODE_SEND); + brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); + + if (devinfo->ver >= 12) + brw_inst_set_send_sel_reg32_desc(devinfo, send, true); + else + brw_set_src1(p, send, addr); + } + + brw_set_dest(p, send, dst); + brw_inst_set_sfid(devinfo, send, sfid); + brw_inst_set_eot(devinfo, send, eot); +} + +void +brw_send_indirect_split_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload0, + struct brw_reg payload1, + struct brw_reg desc, + unsigned desc_imm, + struct brw_reg ex_desc, + unsigned ex_desc_imm, + bool ex_desc_scratch, + bool ex_bso, + bool eot) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_inst *send; + + dst = retype(dst, BRW_REGISTER_TYPE_UW); + + assert(desc.type == BRW_REGISTER_TYPE_UD); + + if (desc.file == BRW_IMMEDIATE_VALUE) { + desc.ud |= desc_imm; + } else { + const struct tgl_swsb swsb = brw_get_default_swsb(p); + struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + /* Load the indirect descriptor to an address 
register using OR so the + * caller can specify additional descriptor bits with the desc_imm + * immediate. + */ + brw_OR(p, addr, desc, brw_imm_ud(desc_imm)); + + brw_pop_insn_state(p); + desc = addr; + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + if (ex_desc.file == BRW_IMMEDIATE_VALUE && + !ex_desc_scratch && + (devinfo->ver >= 12 || + ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) { + /* ATS-M PRMs, Volume 2d: Command Reference: Structures, + * EU_INSTRUCTION_SEND instruction + * + * "ExBSO: Exists If: ([ExDesc.IsReg]==true)" + */ + assert(!ex_bso); + ex_desc.ud |= ex_desc_imm; + } else { + const struct tgl_swsb swsb = brw_get_default_swsb(p); + struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + /* Load the indirect extended descriptor to an address register using OR + * so the caller can specify additional descriptor bits with the + * desc_imm immediate. + * + * Even though the instruction dispatcher always pulls the SFID and EOT + * fields from the instruction itself, actual external unit which + * processes the message gets the SFID and EOT from the extended + * descriptor which comes from the address register. If we don't OR + * those two bits in, the external unit may get confused and hang. + */ + unsigned imm_part = ex_bso ? 0 : (ex_desc_imm | sfid | eot << 5); + + if (ex_desc_scratch) { + /* Or the scratch surface offset together with the immediate part of + * the extended descriptor. 
+ */ + assert(devinfo->verx10 >= 125); + brw_AND(p, addr, + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 10))); + brw_OR(p, addr, addr, brw_imm_ud(imm_part)); + } else if (ex_desc.file == BRW_IMMEDIATE_VALUE) { + /* ex_desc bits 15:12 don't exist in the instruction encoding prior + * to Gfx12, so we may have fallen back to an indirect extended + * descriptor. + */ + brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part)); + } else { + brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part)); + } + + brw_pop_insn_state(p); + ex_desc = addr; + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS); + brw_set_dest(p, send, dst); + brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD)); + brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD)); + + if (desc.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_send_sel_reg32_desc(devinfo, send, 0); + brw_inst_set_send_desc(devinfo, send, desc.ud); + } else { + assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(desc.nr == BRW_ARF_ADDRESS); + assert(desc.subnr == 0); + brw_inst_set_send_sel_reg32_desc(devinfo, send, 1); + } + + if (ex_desc.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0); + brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud); + } else { + assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(ex_desc.nr == BRW_ARF_ADDRESS); + assert((ex_desc.subnr & 0x3) == 0); + brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1); + brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2); + } + + if (ex_bso) { + /* The send instruction ExBSO field does not exist with UGM on Gfx20+, + * it is assumed. 
+ * + * BSpec 56890 + */ + if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM) + brw_inst_set_send_ex_bso(devinfo, send, true); + brw_inst_set_send_src1_len(devinfo, send, GET_BITS(ex_desc_imm, 10, 6)); + } + brw_inst_set_sfid(devinfo, send, sfid); + brw_inst_set_eot(devinfo, send, eot); +} + +static void +brw_send_indirect_surface_message(struct brw_codegen *p, + unsigned sfid, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned desc_imm) +{ + if (surface.file != BRW_IMMEDIATE_VALUE) { + const struct tgl_swsb swsb = brw_get_default_swsb(p); + struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + + /* Mask out invalid bits from the surface index to avoid hangs e.g. when + * some surface array is accessed out of bounds. + */ + brw_AND(p, addr, + suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)), + BRW_GET_SWZ(surface.swizzle, 0)), + brw_imm_ud(0xff)); + + brw_pop_insn_state(p); + + surface = addr; + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + } + + brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false); +} + +static bool +while_jumps_before_offset(const struct intel_device_info *devinfo, + brw_inst *insn, int while_offset, int start_offset) +{ + int scale = 16 / brw_jump_scale(devinfo); + int jip = devinfo->ver == 6 ? 
brw_inst_gfx6_jump_count(devinfo, insn) + : brw_inst_jip(devinfo, insn); + assert(jip < 0); + return while_offset + jip * scale <= start_offset; +} + + +static int +brw_find_next_block_end(struct brw_codegen *p, int start_offset) +{ + int offset; + void *store = p->store; + const struct intel_device_info *devinfo = p->devinfo; + + int depth = 0; + + for (offset = next_offset(devinfo, store, start_offset); + offset < p->next_insn_offset; + offset = next_offset(devinfo, store, offset)) { + brw_inst *insn = store + offset; + + switch (brw_inst_opcode(p->isa, insn)) { + case BRW_OPCODE_IF: + depth++; + break; + case BRW_OPCODE_ENDIF: + if (depth == 0) + return offset; + depth--; + break; + case BRW_OPCODE_WHILE: + /* If the while doesn't jump before our instruction, it's the end + * of a sibling do...while loop. Ignore it. + */ + if (!while_jumps_before_offset(devinfo, insn, offset, start_offset)) + continue; + FALLTHROUGH; + case BRW_OPCODE_ELSE: + case BRW_OPCODE_HALT: + if (depth == 0) + return offset; + break; + default: + break; + } + } + + return 0; +} + +/* There is no DO instruction on gfx6, so to find the end of the loop + * we have to see if the loop is jumping back before our start + * instruction. + */ +static int +brw_find_loop_end(struct brw_codegen *p, int start_offset) +{ + const struct intel_device_info *devinfo = p->devinfo; + int offset; + void *store = p->store; + + assert(devinfo->ver >= 6); + + /* Always start after the instruction (such as a WHILE) we're trying to fix + * up. 
+ */ + for (offset = next_offset(devinfo, store, start_offset); + offset < p->next_insn_offset; + offset = next_offset(devinfo, store, offset)) { + brw_inst *insn = store + offset; + + if (brw_inst_opcode(p->isa, insn) == BRW_OPCODE_WHILE) { + if (while_jumps_before_offset(devinfo, insn, offset, start_offset)) + return offset; + } + } + assert(!"not reached"); + return start_offset; +} + +/* After program generation, go back and update the UIP and JIP of + * BREAK, CONT, and HALT instructions to their correct locations. + */ +void +brw_set_uip_jip(struct brw_codegen *p, int start_offset) +{ + const struct intel_device_info *devinfo = p->devinfo; + int offset; + int br = brw_jump_scale(devinfo); + int scale = 16 / br; + void *store = p->store; + + if (devinfo->ver < 6) + return; + + for (offset = start_offset; offset < p->next_insn_offset; offset += 16) { + brw_inst *insn = store + offset; + assert(brw_inst_cmpt_control(devinfo, insn) == 0); + + switch (brw_inst_opcode(p->isa, insn)) { + case BRW_OPCODE_BREAK: { + int block_end_offset = brw_find_next_block_end(p, offset); + assert(block_end_offset != 0); + brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); + /* Gfx7 UIP points to WHILE; Gfx6 points just after it */ + brw_inst_set_uip(devinfo, insn, + (brw_find_loop_end(p, offset) - offset + + (devinfo->ver == 6 ? 16 : 0)) / scale); + break; + } + + case BRW_OPCODE_CONTINUE: { + int block_end_offset = brw_find_next_block_end(p, offset); + assert(block_end_offset != 0); + brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); + brw_inst_set_uip(devinfo, insn, + (brw_find_loop_end(p, offset) - offset) / scale); + + assert(brw_inst_uip(devinfo, insn) != 0); + assert(brw_inst_jip(devinfo, insn) != 0); + break; + } + + case BRW_OPCODE_ENDIF: { + int block_end_offset = brw_find_next_block_end(p, offset); + int32_t jump = (block_end_offset == 0) ? 
+ 1 * br : (block_end_offset - offset) / scale; + if (devinfo->ver >= 7) + brw_inst_set_jip(devinfo, insn, jump); + else + brw_inst_set_gfx6_jump_count(devinfo, insn, jump); + break; + } + + case BRW_OPCODE_HALT: { + /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19): + * + * "In case of the halt instruction not inside any conditional + * code block, the value of and should be the + * same. In case of the halt instruction inside conditional code + * block, the should be the end of the program, and the + * should be end of the most inner conditional code block." + * + * The uip will have already been set by whoever set up the + * instruction. + */ + int block_end_offset = brw_find_next_block_end(p, offset); + if (block_end_offset == 0) { + brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn)); + } else { + brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); + } + assert(brw_inst_uip(devinfo, insn) != 0); + assert(brw_inst_jip(devinfo, insn) != 0); + break; + } + + default: + break; + } + } +} + +void brw_ff_sync(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + unsigned response_length, + bool eot) +{ + const struct intel_device_info *devinfo = p->devinfo; + brw_inst *insn; + + gfx6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, brw_imm_d(0)); + + if (devinfo->ver < 6) + brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); + + brw_set_ff_sync_message(p, + insn, + allocate, + response_length, + eot); +} + +/** + * Emit the SEND instruction necessary to generate stream output data on Gfx6 + * (for transform feedback). + * + * If send_commit_msg is true, this is the last piece of stream output data + * from this thread, so send the data as a committed write. 
According to the + * Sandy Bridge PRM (volume 2 part 1, section 4.5.1): + * + * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all + * writes are complete by sending the final write as a committed write." + */ +void +brw_svb_write(struct brw_codegen *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + bool send_commit_msg) +{ + const struct intel_device_info *devinfo = p->devinfo; + assert(devinfo->ver == 6); + const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE; + brw_inst *insn; + + gfx6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = next_insn(p, BRW_OPCODE_SEND); + brw_inst_set_sfid(devinfo, insn, target_cache); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_desc(p, insn, + brw_message_desc(devinfo, 1, send_commit_msg, true) | + brw_dp_write_desc(devinfo, binding_table_index, + 0, /* msg_control: ignored */ + GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE, + send_commit_msg)); /* send_commit_msg */ +} + +static unsigned +brw_surface_payload_size(unsigned num_channels, + unsigned exec_size /**< 0 for SIMD4x2 */) +{ + if (exec_size == 0) + return 1; /* SIMD4x2 */ + else if (exec_size <= 8) + return num_channels; + else + return 2 * num_channels; +} + +void +brw_untyped_atomic(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned atomic_op, + unsigned msg_length, + bool response_expected, + bool header_present) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->verx10 >= 75 ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GFX7_SFID_DATAPORT_DATA_CACHE); + const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; + /* SIMD4x2 untyped atomic instructions only exist on HSW+ */ + const bool has_simd4x2 = devinfo->verx10 >= 75; + const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : + has_simd4x2 ? 
0 : 8; + const unsigned response_length = + brw_surface_payload_size(response_expected, exec_size); + const unsigned desc = + brw_message_desc(devinfo, msg_length, response_length, header_present) | + brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op, + response_expected); + /* Mask out unused components -- This is especially important in Align16 + * mode on generations that don't have native support for SIMD4x2 atomics, + * because unused but enabled components will cause the dataport to perform + * additional atomic operations on the addresses that happen to be in the + * uninitialized Y, Z and W coordinates of the payload. + */ + const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X; + + brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask), + payload, surface, desc); +} + +void +brw_untyped_surface_read(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->verx10 >= 75 ? + HSW_SFID_DATAPORT_DATA_CACHE_1 : + GFX7_SFID_DATAPORT_DATA_CACHE); + const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; + const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0; + const unsigned response_length = + brw_surface_payload_size(num_channels, exec_size); + const unsigned desc = + brw_message_desc(devinfo, msg_length, response_length, false) | + brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false); + + brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc); +} + +void +brw_untyped_surface_write(struct brw_codegen *p, + struct brw_reg payload, + struct brw_reg surface, + unsigned msg_length, + unsigned num_channels, + bool header_present) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned sfid = (devinfo->verx10 >= 75 ? 
+ HSW_SFID_DATAPORT_DATA_CACHE_1 : + GFX7_SFID_DATAPORT_DATA_CACHE); + const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; + /* SIMD4x2 untyped surface write instructions only exist on HSW+ */ + const bool has_simd4x2 = devinfo->verx10 >= 75; + const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : + has_simd4x2 ? 0 : 8; + const unsigned desc = + brw_message_desc(devinfo, msg_length, 0, header_present) | + brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true); + /* Mask out unused components -- See comment in brw_untyped_atomic(). */ + const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW; + + brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask), + payload, surface, desc); +} + +static void +brw_set_memory_fence_message(struct brw_codegen *p, + struct brw_inst *insn, + enum brw_message_target sfid, + bool commit_enable, + unsigned bti) +{ + const struct intel_device_info *devinfo = p->devinfo; + + brw_set_desc(p, insn, brw_message_desc( + devinfo, 1, (commit_enable ? 1 : 0), true)); + + brw_inst_set_sfid(devinfo, insn, sfid); + + switch (sfid) { + case GFX6_SFID_DATAPORT_RENDER_CACHE: + brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE); + break; + case GFX7_SFID_DATAPORT_DATA_CACHE: + brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE); + break; + default: + unreachable("Not reached"); + } + + if (commit_enable) + brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5); + + assert(devinfo->ver >= 11 || bti == 0); + brw_inst_set_binding_table_index(devinfo, insn, bti); +} + +static void +gfx12_set_memory_fence_message(struct brw_codegen *p, + struct brw_inst *insn, + enum brw_message_target sfid, + uint32_t desc) +{ + const unsigned mlen = 1 * reg_unit(p->devinfo); /* g0 header */ + /* Completion signaled by write to register. No data returned. 
*/ + const unsigned rlen = 1 * reg_unit(p->devinfo); + + brw_inst_set_sfid(p->devinfo, insn, sfid); + + if (sfid == BRW_SFID_URB && p->devinfo->ver < 20) { + brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) | + brw_message_desc(p->devinfo, mlen, rlen, true)); + } else { + enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc); + enum lsc_flush_type flush_type = lsc_fence_msg_desc_flush_type(p->devinfo, desc); + + if (sfid == GFX12_SFID_TGM) { + scope = LSC_FENCE_TILE; + flush_type = LSC_FLUSH_TYPE_EVICT; + } + + /* Wa_14012437816: + * + * "For any fence greater than local scope, always set flush type to + * at least invalidate so that fence goes on properly." + * + * "The bug is if flush_type is 'None', the scope is always downgraded + * to 'local'." + * + * Here set scope to NONE_6 instead of NONE, which has the same effect + * as NONE but avoids the downgrade to scope LOCAL. + */ + if (intel_needs_workaround(p->devinfo, 14012437816) && + scope > LSC_FENCE_LOCAL && + flush_type == LSC_FLUSH_TYPE_NONE) { + flush_type = LSC_FLUSH_TYPE_NONE_6; + } + + brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope, + flush_type, false) | + brw_message_desc(p->devinfo, mlen, rlen, false)); + } +} + +void +brw_memory_fence(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + enum opcode send_op, + enum brw_message_target sfid, + uint32_t desc, + bool commit_enable, + unsigned bti) +{ + const struct intel_device_info *devinfo = p->devinfo; + + dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW); + src = retype(vec1(src), BRW_REGISTER_TYPE_UD); + + /* Set dst as destination for dependency tracking, the MEMORY_FENCE + * message doesn't write anything back. 
+ */ + struct brw_inst *insn = next_insn(p, send_op); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); + brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); + brw_set_dest(p, insn, dst); + brw_set_src0(p, insn, src); + + /* All DG2 hardware requires LSC for fence messages, even A-step */ + if (devinfo->has_lsc) + gfx12_set_memory_fence_message(p, insn, sfid, desc); + else + brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti); +} + +void +brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst, bool last) +{ + const struct intel_device_info *devinfo = p->devinfo; + const unsigned exec_size = 1 << brw_get_default_exec_size(p); + const unsigned qtr_control = brw_get_default_group(p) / 8; + brw_inst *inst; + + assert(devinfo->ver == 7); + + brw_push_insn_state(p); + + /* The flag register is only used on Gfx7 in align1 mode, so avoid setting + * unnecessary bits in the instruction words, get the information we need + * and reset the default flag register. This allows more instructions to be + * compacted. + */ + const unsigned flag_subreg = p->current->flag_subreg; + brw_set_default_flag_reg(p, 0, 0); + + if (brw_get_default_access_mode(p) == BRW_ALIGN_1) { + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + const struct brw_reg flag = brw_flag_subreg(flag_subreg); + + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); + + /* Run enough instructions returning zero with execution masking and + * a conditional modifier enabled in order to get the full execution + * mask in f1.0. We could use a single 32-wide move here if it + * weren't because of the hardware bug that causes channel enables to + * be applied incorrectly to the second half of 32-wide instructions + * on Gfx7. 
+ */ + const unsigned lower_size = MIN2(16, exec_size); + for (unsigned i = 0; i < exec_size / lower_size; i++) { + inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0)); + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); + brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control); + brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z); + brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1); + brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2); + brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2); + } + + /* Find the first bit set in the exec_size-wide portion of the flag + * register that was updated by the last sequence of MOV + * instructions. + */ + const enum brw_reg_type type = brw_int_type(exec_size / 8, false); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + if (!last) { + inst = brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control)); + } else { + inst = brw_LZD(p, vec1(dst), byte_offset(retype(flag, type), qtr_control)); + struct brw_reg neg = vec1(dst); + neg.negate = true; + inst = brw_ADD(p, vec1(dst), neg, brw_imm_uw(31)); + } + } else { + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + /* Overwrite the destination without and with execution masking to + * find out which of the channels is active. 
+ */ + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), + brw_imm_ud(1)); + + inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), + brw_imm_ud(0)); + brw_pop_insn_state(p); + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); + } + + brw_pop_insn_state(p); +} + +void +brw_broadcast(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg idx) +{ + const struct intel_device_info *devinfo = p->devinfo; + const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; + brw_inst *inst; + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4); + + assert(src.file == BRW_GENERAL_REGISTER_FILE && + src.address_mode == BRW_ADDRESS_DIRECT); + assert(!src.abs && !src.negate); + + /* Gen12.5 adds the following region restriction: + * + * "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float + * and Quad-Word data must not be used." + * + * We require the source and destination types to match so stomp to an + * unsigned integer type. + */ + assert(src.type == dst.type); + src.type = dst.type = brw_reg_type_from_bit_size(type_sz(src.type) * 8, + BRW_REGISTER_TYPE_UD); + + if ((src.vstride == 0 && (src.hstride == 0 || !align1)) || + idx.file == BRW_IMMEDIATE_VALUE) { + /* Trivial, the source is already uniform or the index is a constant. + * We will typically not get here if the optimizer is doing its job, but + * asserting would be mean. + */ + const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; + src = align1 ? 
stride(suboffset(src, i), 0, 1, 0) : + stride(suboffset(src, 4 * i), 0, 4, 1); + + if (type_sz(src.type) > 4 && !devinfo->has_64bit_int) { + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), + subscript(src, BRW_REGISTER_TYPE_D, 0)); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), + subscript(src, BRW_REGISTER_TYPE_D, 1)); + } else { + brw_MOV(p, dst, src); + } + } else { + /* From the Haswell PRM section "Register Region Restrictions": + * + * "The lower bits of the AddressImmediate must not overflow to + * change the register address. The lower 5 bits of Address + * Immediate when added to lower 5 bits of address register gives + * the sub-register offset. The upper bits of Address Immediate + * when added to upper bits of address register gives the register + * address. Any overflow from sub-register offset is dropped." + * + * Fortunately, for broadcast, we never have a sub-register offset so + * this isn't an issue. + */ + assert(src.subnr == 0); + + if (align1) { + const struct brw_reg addr = + retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); + unsigned offset = src.nr * REG_SIZE + src.subnr; + /* Limit in bytes of the signed indirect addressing immediate. */ + const unsigned limit = 512; + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + + /* Take into account the component size and horizontal stride. */ + assert(src.vstride == src.hstride + src.width); + brw_SHL(p, addr, vec1(idx), + brw_imm_ud(util_logbase2(type_sz(src.type)) + + src.hstride - 1)); + + /* We can only address up to limit bytes using the indirect + * addressing immediate, account for the difference if the source + * register is above this limit. 
+ */ + if (offset >= limit) { + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit)); + offset = offset % limit; + } + + brw_pop_insn_state(p); + + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + + /* Use indirect addressing to fetch the specified component. */ + if (type_sz(src.type) > 4 && + (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) || + !devinfo->has_64bit_int)) { + /* From the Cherryview PRM Vol 7. "Register Region Restrictions": + * + * "When source or destination datatype is 64b or operation is + * integer DWord multiply, indirect addressing must not be + * used." + * + * To work around both of this issue, we do two integer MOVs + * insead of one 64-bit MOV. Because no double value should ever + * cross a register boundary, it's safe to use the immediate + * offset in the indirect here to handle adding 4 bytes to the + * offset and avoid the extra ADD to the register file. + */ + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), + retype(brw_vec1_indirect(addr.subnr, offset), + BRW_REGISTER_TYPE_D)); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), + retype(brw_vec1_indirect(addr.subnr, offset + 4), + BRW_REGISTER_TYPE_D)); + } else { + brw_MOV(p, dst, + retype(brw_vec1_indirect(addr.subnr, offset), src.type)); + } + } else { + /* In SIMD4x2 mode the index can be either zero or one, replicate it + * to all bits of a flag register, + */ + inst = brw_MOV(p, + brw_null_reg(), + stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1)); + brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE); + brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ); + brw_inst_set_flag_reg_nr(devinfo, inst, 1); + + /* and use predicated SEL to pick the right channel. 
*/ + inst = brw_SEL(p, dst, + stride(suboffset(src, 4), 4, 4, 1), + stride(src, 4, 4, 1)); + brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL); + brw_inst_set_flag_reg_nr(devinfo, inst, 1); + } + } + + brw_pop_insn_state(p); +} + + +/** + * Emit the SEND message for a barrier + */ +void +brw_barrier(struct brw_codegen *p, struct brw_reg src) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_inst *inst; + + assert(devinfo->ver >= 7); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + inst = next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); + brw_set_src0(p, inst, src); + brw_set_src1(p, inst, brw_null_reg()); + brw_set_desc(p, inst, brw_message_desc(devinfo, + 1 * reg_unit(devinfo), 0, false)); + + brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY); + brw_inst_set_gateway_subfuncid(devinfo, inst, + BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG); + + brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); + brw_pop_insn_state(p); +} + + +/** + * Emit the wait instruction for a barrier + */ +void +brw_WAIT(struct brw_codegen *p) +{ + const struct intel_device_info *devinfo = p->devinfo; + struct brw_inst *insn; + + struct brw_reg src = brw_notification_reg(); + + insn = next_insn(p, BRW_OPCODE_WAIT); + brw_set_dest(p, insn, src); + brw_set_src0(p, insn, src); + brw_set_src1(p, insn, brw_null_reg()); + + brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); +} + +void +brw_float_controls_mode(struct brw_codegen *p, + unsigned mode, unsigned mask) +{ + assert(p->current->mask_control == BRW_MASK_DISABLE); + + /* From the Skylake PRM, Volume 7, page 760: + * "Implementation Restriction on Register Access: When the control + * register is used as an explicit source and/or destination, hardware + * does not ensure execution pipeline coherency. 
Software must set the + * thread control field to ‘switch’ for an instruction that uses + * control register as an explicit operand." + * + * On Gfx12+ this is implemented in terms of SWSB annotations instead. + */ + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + + brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), + brw_imm_ud(~mask)); + brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1); + if (p->devinfo->ver < 12) + brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH); + + if (mode) { + brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), + brw_imm_ud(mode)); + brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1); + if (p->devinfo->ver < 12) + brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH); + } + + if (p->devinfo->ver >= 12) + brw_SYNC(p, TGL_SYNC_NOP); +} + +void +brw_update_reloc_imm(const struct brw_isa_info *isa, + brw_inst *inst, + uint32_t value) +{ + const struct intel_device_info *devinfo = isa->devinfo; + + /* Sanity check that the instruction is a MOV of an immediate */ + assert(brw_inst_opcode(isa, inst) == BRW_OPCODE_MOV); + assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE); + + /* If it was compacted, we can't safely rewrite */ + assert(brw_inst_cmpt_control(devinfo, inst) == 0); + + brw_inst_set_imm_ud(devinfo, inst, value); +} + +/* A default value for constants that will be patched at run-time. + * We pick an arbitrary value that prevents instruction compaction. 
+ */ +#define DEFAULT_PATCH_IMM 0x4a7cc037 + +void +brw_MOV_reloc_imm(struct brw_codegen *p, + struct brw_reg dst, + enum brw_reg_type src_type, + uint32_t id) +{ + assert(type_sz(src_type) == 4); + assert(type_sz(dst.type) == 4); + + brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM, + p->next_insn_offset, 0); + + brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type)); +} diff --git a/src/intel/compiler/elk/brw_eu_util.c b/src/intel/compiler/elk/brw_eu_util.c new file mode 100644 index 00000000000..9fc8ff9c7c7 --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_util.c @@ -0,0 +1,119 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + + +#include "brw_eu_defines.h" +#include "brw_eu.h" + + +void brw_math_invert( struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src) +{ + gfx4_math(p, + dst, + BRW_MATH_FUNCTION_INV, + 0, + src, + BRW_MATH_PRECISION_FULL); +} + + + +void brw_copy4(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count) +{ + unsigned i; + + dst = vec4(dst); + src = vec4(src); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16)); + } +} + + +void brw_copy8(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count) +{ + unsigned i; + + dst = vec8(dst); + src = vec8(src); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + } +} + + +void brw_copy_indirect_to_indirect(struct brw_codegen *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + unsigned count) +{ + unsigned i; + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta)); + brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16)); + } +} + + +void brw_copy_from_indirect(struct brw_codegen *p, + struct brw_reg dst, + struct brw_indirect ptr, + unsigned count) +{ + unsigned i; + + dst = vec4(dst); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta)); + brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16)); + } +} diff --git a/src/intel/compiler/elk/brw_eu_validate.c b/src/intel/compiler/elk/brw_eu_validate.c new file mode 100644 index 00000000000..ec22ef4fa03 --- /dev/null +++ b/src/intel/compiler/elk/brw_eu_validate.c @@ -0,0 +1,2827 @@ +/* + * Copyright © 
2015-2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_eu_validate.c + * + * This file implements a pass that validates shader assembly. + * + * The restrictions implemented herein are intended to verify that instructions + * in shader assembly do not violate restrictions documented in the graphics + * programming reference manuals. + * + * The restrictions are difficult for humans to quickly verify due to their + * complexity and abundance. + * + * It is critical that this code is thoroughly unit tested because false + * results will lead developers astray, which is worse than having no validator + * at all. Functional changes to this file without corresponding unit tests (in + * test_eu_validate.cpp) will be rejected. + */ + +#include +#include "brw_eu.h" +#include "brw_disasm_info.h" + +/* We're going to do lots of string concatenation, so this should help. 
 */
/* A growable, length-counted error string (str may be NULL when empty). */
struct string {
   char *str;
   size_t len;
};

/* Append src to dest, reallocating dest's buffer to fit.
 * NOTE(review): the realloc result is assigned straight back to dest->str
 * with no NULL check, so an allocation failure would crash in the memcpy
 * below — presumably acceptable for a validator error path; confirm.
 */
static void
cat(struct string *dest, const struct string src)
{
   dest->str = realloc(dest->str, dest->len + src.len + 1);
   memcpy(dest->str + dest->len, src.str, src.len);
   dest->str[dest->len + src.len] = '\0';
   dest->len = dest->len + src.len;
}
#define CAT(dest, src) cat(&dest, (struct string){src, strlen(src)})

/* Counted-substring search; a NULL haystack never matches. */
static bool
contains(const struct string haystack, const struct string needle)
{
   return haystack.str && memmem(haystack.str, haystack.len,
                                 needle.str, needle.len) != NULL;
}
#define CONTAINS(haystack, needle) \
   contains(haystack, (struct string){needle, strlen(needle)})

#define error(str)   "\tERROR: " str "\n"
#define ERROR_INDENT "\t       "

/* Append an error message to the local 'error_msg'; duplicates of the same
 * message are suppressed via CONTAINS.
 */
#define ERROR(msg) ERROR_IF(true, msg)
#define ERROR_IF(cond, msg)                               \
   do {                                                   \
      if ((cond) && !CONTAINS(error_msg, error(msg))) {   \
         CAT(error_msg, error(msg));                      \
      }                                                   \
   } while(0)

/* Run one validation pass and fold its (heap-allocated) result into
 * 'error_msg', freeing the intermediate string.
 */
#define CHECK(func, args...)                              \
   do {                                                   \
      struct string __msg = func(isa, inst, ##args);      \
      if (__msg.str) {                                    \
         cat(&error_msg, __msg);                          \
         free(__msg.str);                                 \
      }                                                   \
   } while (0)

/* Decode the hardware stride encoding: 0 -> 0, n -> 2^(n-1). */
#define STRIDE(stride) (stride != 0 ? \
   1 << ((stride) - 1) : 0)
/* Decode the hardware width encoding: n -> 2^n. */
#define WIDTH(width)   (1 << (width))

/* True for any of the four send-family opcodes. */
static bool
inst_is_send(const struct brw_isa_info *isa, const brw_inst *inst)
{
   switch (brw_inst_opcode(isa, inst)) {
   case BRW_OPCODE_SEND:
   case BRW_OPCODE_SENDC:
   case BRW_OPCODE_SENDS:
   case BRW_OPCODE_SENDSC:
      return true;
   default:
      return false;
   }
}

/* On Gfx12+ every send is a split send; before that only SENDS/SENDSC are. */
static bool
inst_is_split_send(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   if (devinfo->ver >= 12) {
      return inst_is_send(isa, inst);
   } else {
      switch (brw_inst_opcode(isa, inst)) {
      case BRW_OPCODE_SENDS:
      case BRW_OPCODE_SENDSC:
         return true;
      default:
         return false;
      }
   }
}

/* Map an unsigned integer register type to its signed counterpart; all
 * other types pass through unchanged.
 */
static unsigned
signed_type(unsigned type)
{
   switch (type) {
   case BRW_REGISTER_TYPE_UD: return BRW_REGISTER_TYPE_D;
   case BRW_REGISTER_TYPE_UW: return BRW_REGISTER_TYPE_W;
   case BRW_REGISTER_TYPE_UB: return BRW_REGISTER_TYPE_B;
   case BRW_REGISTER_TYPE_UQ: return BRW_REGISTER_TYPE_Q;
   default: return type;
   }
}

/* Destination type of an instruction; sends on Gfx12+ are treated as D. */
static enum brw_reg_type
inst_dst_type(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   return (devinfo->ver < 12 || !inst_is_send(isa, inst)) ?
          brw_inst_dst_type(devinfo, inst) : BRW_REGISTER_TYPE_D;
}

/* A "raw" MOV: no saturate, no source modifiers, and src/dst types equal
 * up to signedness. Immediate vector types (VF/V/UV) are conservatively
 * rejected (see FIXME below).
 */
static bool
inst_is_raw_move(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned dst_type = signed_type(inst_dst_type(isa, inst));
   unsigned src_type = signed_type(brw_inst_src0_type(devinfo, inst));

   if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) {
      /* FIXME: not strictly true */
      if (brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_VF ||
          brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_UV ||
          brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_V) {
         return false;
      }
   } else if (brw_inst_src0_negate(devinfo, inst) ||
              brw_inst_src0_abs(devinfo, inst)) {
      return false;
   }

   return brw_inst_opcode(isa, inst) == BRW_OPCODE_MOV &&
          brw_inst_saturate(devinfo, inst) == 0 &&
          dst_type == src_type;
}

/* Destination is the ARF null register. */
static bool
dst_is_null(const struct intel_device_info *devinfo, const brw_inst *inst)
{
   return brw_inst_dst_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          brw_inst_dst_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
}

/* src0 is the ARF null register (direct addressing only). */
static bool
src0_is_null(const struct intel_device_info *devinfo, const brw_inst *inst)
{
   return brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT &&
          brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
}

/* src1 is the ARF null register. */
static bool
src1_is_null(const struct intel_device_info *devinfo, const brw_inst *inst)
{
   return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
}

/* src0 is an accumulator register (high nibble of the reg nr selects the
 * ARF; the low nibble is the accumulator index).
 */
static bool
src0_is_acc(const struct intel_device_info *devinfo, const brw_inst *inst)
{
   return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          (brw_inst_src0_da_reg_nr(devinfo, inst) & 0xF0) == BRW_ARF_ACCUMULATOR;
}

/* src1 is an accumulator register. */
static bool
src1_is_acc(const struct intel_device_info
            *devinfo, const brw_inst *inst)
{
   return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
          (brw_inst_src1_da_reg_nr(devinfo, inst) & 0xF0) == BRW_ARF_ACCUMULATOR;
}

/* src0 uses the scalar region <0;1,0>. */
static bool
src0_has_scalar_region(const struct intel_device_info *devinfo,
                       const brw_inst *inst)
{
   return brw_inst_src0_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 &&
          brw_inst_src0_width(devinfo, inst) == BRW_WIDTH_1 &&
          brw_inst_src0_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
}

/* src1 uses the scalar region <0;1,0>. */
static bool
src1_has_scalar_region(const struct intel_device_info *devinfo,
                       const brw_inst *inst)
{
   return brw_inst_src1_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 &&
          brw_inst_src1_width(devinfo, inst) == BRW_WIDTH_1 &&
          brw_inst_src1_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
}

/* Reject encodings with no valid meaning: bad execution sizes, bad channel
 * offsets (Gfx12+), bad register files, and bad register types.
 */
static struct string
invalid_values(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   switch ((enum brw_execution_size) brw_inst_exec_size(devinfo, inst)) {
   case BRW_EXECUTE_1:
   case BRW_EXECUTE_2:
   case BRW_EXECUTE_4:
   case BRW_EXECUTE_8:
   case BRW_EXECUTE_16:
   case BRW_EXECUTE_32:
      break;
   default:
      ERROR("invalid execution size");
      break;
   }

   /* With a bogus exec size the remaining checks would be meaningless. */
   if (error_msg.str)
      return error_msg;

   if (devinfo->ver >= 12) {
      unsigned group_size = 1 << brw_inst_exec_size(devinfo, inst);
      unsigned qtr_ctrl = brw_inst_qtr_control(devinfo, inst);
      unsigned nib_ctrl = brw_inst_nib_control(devinfo, inst);

      /* Channel offset in channels: each quarter is 8, each nibble is 4. */
      unsigned chan_off = (qtr_ctrl * 2 + nib_ctrl) << 2;
      ERROR_IF(chan_off % group_size != 0,
               "The execution size must be a factor of the chosen offset");
   }

   if (inst_is_send(isa, inst))
      return error_msg;

   if (num_sources == 3) {
      /* Nothing to test:
       *    No 3-src instructions on Gfx4-5
       *    No reg file bits on Gfx6-10 (align16)
       *    No invalid encodings on Gfx10-12
       *    (align1)
       */
   } else {
      if (devinfo->ver > 6) {
         ERROR_IF(brw_inst_dst_reg_file(devinfo, inst) == MRF ||
                  (num_sources > 0 &&
                   brw_inst_src0_reg_file(devinfo, inst) == MRF) ||
                  (num_sources > 1 &&
                   brw_inst_src1_reg_file(devinfo, inst) == MRF),
                  "invalid register file encoding");
      }
   }

   if (error_msg.str)
      return error_msg;

   /* Validate the type encodings of whichever operands are present. */
   if (num_sources == 3) {
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if (devinfo->ver >= 10) {
            ERROR_IF(brw_inst_3src_a1_dst_type (devinfo, inst) == INVALID_REG_TYPE ||
                     brw_inst_3src_a1_src0_type(devinfo, inst) == INVALID_REG_TYPE ||
                     brw_inst_3src_a1_src1_type(devinfo, inst) == INVALID_REG_TYPE ||
                     brw_inst_3src_a1_src2_type(devinfo, inst) == INVALID_REG_TYPE,
                     "invalid register type encoding");
         } else {
            ERROR("Align1 mode not allowed on Gen < 10");
         }
      } else {
         ERROR_IF(brw_inst_3src_a16_dst_type(devinfo, inst) == INVALID_REG_TYPE ||
                  brw_inst_3src_a16_src_type(devinfo, inst) == INVALID_REG_TYPE,
                  "invalid register type encoding");
      }
   } else {
      ERROR_IF(brw_inst_dst_type (devinfo, inst) == INVALID_REG_TYPE ||
               (num_sources > 0 &&
                brw_inst_src0_type(devinfo, inst) == INVALID_REG_TYPE) ||
               (num_sources > 1 &&
                brw_inst_src1_type(devinfo, inst) == INVALID_REG_TYPE),
               "invalid register type encoding");
   }

   return error_msg;
}

/* Sources that must be non-null: src0 for anything but SYNC, src1 for
 * two-source instructions.
 */
static struct string
sources_not_null(const struct brw_isa_info *isa,
                 const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   /* Nothing to test. 3-src instructions can only have GRF sources, and
    * there's no bit to control the file.
    */
   if (num_sources == 3)
      return (struct string){};

   /* Nothing to test. Split sends can only encode a file in sources that are
    * allowed to be NULL.
    */
   if (inst_is_split_send(isa, inst))
      return (struct string){};

   if (num_sources >= 1 && brw_inst_opcode(isa, inst) != BRW_OPCODE_SYNC)
      ERROR_IF(src0_is_null(devinfo, inst), "src0 is null");

   if (num_sources == 2)
      ERROR_IF(src1_is_null(devinfo, inst), "src1 is null");

   return error_msg;
}

/* Align16 access mode is gone from the hardware starting with Gfx11. */
static struct string
alignment_supported(const struct brw_isa_info *isa,
                    const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   struct string error_msg = { .str = NULL, .len = 0 };

   ERROR_IF(devinfo->ver >= 11 && brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16,
            "Align16 not supported");

   return error_msg;
}

/* True if the instruction reads the accumulator, either implicitly
 * (MAC/MACH/SADA2) or through an explicit acc source register.
 */
static bool
inst_uses_src_acc(const struct brw_isa_info *isa,
                  const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* Check instructions that use implicit accumulator sources */
   switch (brw_inst_opcode(isa, inst)) {
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MACH:
   case BRW_OPCODE_SADA2:
      return true;
   default:
      break;
   }

   /* FIXME: support 3-src instructions */
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   assert(num_sources < 3);

   return src0_is_acc(devinfo, inst) || (num_sources > 1 && src1_is_acc(devinfo, inst));
}

/* Restrictions specific to send-family instructions: source files,
 * EOT payload register ranges, and split-send payload overlap.
 */
static struct string
send_restrictions(const struct brw_isa_info *isa,
                  const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   struct string error_msg = { .str = NULL, .len = 0 };

   if (inst_is_split_send(isa, inst)) {
      ERROR_IF(brw_inst_send_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
               brw_inst_send_src1_reg_nr(devinfo, inst) != BRW_ARF_NULL,
               "src1 of split send must be a GRF or NULL");

      ERROR_IF(brw_inst_eot(devinfo, inst) &&
               brw_inst_src0_da_reg_nr(devinfo, inst) < 112,
               "send with EOT must use g112-g127");
      ERROR_IF(brw_inst_eot(devinfo, inst) &&
               brw_inst_send_src1_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE &&
               brw_inst_send_src1_reg_nr(devinfo, inst) < 112,
               "send with EOT must use g112-g127");

      if (brw_inst_send_src0_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE &&
          brw_inst_send_src1_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE) {
         /* Assume minimums if we don't know */
         unsigned mlen = 1;
         if (!brw_inst_send_sel_reg32_desc(devinfo, inst)) {
            const uint32_t desc = brw_inst_send_desc(devinfo, inst);
            mlen = brw_message_desc_mlen(devinfo, desc) / reg_unit(devinfo);
         }

         unsigned ex_mlen = 1;
         if (!brw_inst_send_sel_reg32_ex_desc(devinfo, inst)) {
            const uint32_t ex_desc = brw_inst_sends_ex_desc(devinfo, inst);
            ex_mlen = brw_message_ex_desc_ex_mlen(devinfo, ex_desc) /
                      reg_unit(devinfo);
         }
         /* The ranges [src0, src0+mlen) and [src1, src1+ex_mlen) must be
          * disjoint.
          */
         const unsigned src0_reg_nr = brw_inst_src0_da_reg_nr(devinfo, inst);
         const unsigned src1_reg_nr = brw_inst_send_src1_reg_nr(devinfo, inst);
         ERROR_IF((src0_reg_nr <= src1_reg_nr &&
                   src1_reg_nr < src0_reg_nr + mlen) ||
                  (src1_reg_nr <= src0_reg_nr &&
                   src0_reg_nr < src1_reg_nr + ex_mlen),
                  "split send payloads must not overlap");
      }
   } else if (inst_is_send(isa, inst)) {
      ERROR_IF(brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT,
               "send must use direct addressing");

      if (devinfo->ver >= 7) {
         ERROR_IF(brw_inst_send_src0_reg_file(devinfo, inst) != BRW_GENERAL_REGISTER_FILE,
                  "send from non-GRF");
         ERROR_IF(brw_inst_eot(devinfo, inst) &&
                  brw_inst_src0_da_reg_nr(devinfo, inst) < 112,
                  "send with EOT must use g112-g127");
      }

      if (devinfo->ver >= 8) {
         ERROR_IF(!dst_is_null(devinfo, inst) &&
                  (brw_inst_dst_da_reg_nr(devinfo, inst) +
                   brw_inst_rlen(devinfo, inst) > 127) &&
                  (brw_inst_src0_da_reg_nr(devinfo, inst) +
                   brw_inst_mlen(devinfo, inst) >
                   brw_inst_dst_da_reg_nr(devinfo, inst)),
                  "r127 must not be used for return address when there is "
                  "a src and dest overlap");
      }
   }

   return error_msg;
}

/* ILLEGAL is the only opcode this validator treats as unsupported. */
static bool
is_unsupported_inst(const struct brw_isa_info *isa,
                    const brw_inst *inst)
{
   return
      brw_inst_opcode(isa, inst) == BRW_OPCODE_ILLEGAL;
}

/**
 * Returns whether a combination of two types would qualify as mixed float
 * operation mode
 */
static inline bool
types_are_mixed_float(enum brw_reg_type t0, enum brw_reg_type t1)
{
   return (t0 == BRW_REGISTER_TYPE_F && t1 == BRW_REGISTER_TYPE_HF) ||
          (t1 == BRW_REGISTER_TYPE_F && t0 == BRW_REGISTER_TYPE_HF);
}

/* Collapse a register type to the type the EU executes it as: sub-word
 * integers (and packed integer vectors) widen to W, VF behaves as F, and
 * the 64-bit integer types collapse to Q.
 */
static enum brw_reg_type
execution_type_for_type(enum brw_reg_type type)
{
   switch (type) {
   case BRW_REGISTER_TYPE_NF:
   case BRW_REGISTER_TYPE_DF:
   case BRW_REGISTER_TYPE_F:
   case BRW_REGISTER_TYPE_HF:
      return type;

   case BRW_REGISTER_TYPE_VF:
      return BRW_REGISTER_TYPE_F;

   case BRW_REGISTER_TYPE_Q:
   case BRW_REGISTER_TYPE_UQ:
      return BRW_REGISTER_TYPE_Q;

   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      return BRW_REGISTER_TYPE_D;

   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_V:
   case BRW_REGISTER_TYPE_UV:
      return BRW_REGISTER_TYPE_W;
   }
   unreachable("not reached");
}

/**
 * Returns the execution type of an instruction \p inst
 */
static enum brw_reg_type
execution_type(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   enum brw_reg_type src0_exec_type, src1_exec_type;

   /* Execution data type is independent of destination data type, except in
    * mixed F/HF instructions.
    */
   enum brw_reg_type dst_exec_type = inst_dst_type(isa, inst);

   src0_exec_type = execution_type_for_type(brw_inst_src0_type(devinfo, inst));
   if (num_sources == 1) {
      /* A lone HF source defers to the destination's execution type. */
      if (src0_exec_type == BRW_REGISTER_TYPE_HF)
         return dst_exec_type;
      return src0_exec_type;
   }

   src1_exec_type = execution_type_for_type(brw_inst_src1_type(devinfo, inst));
   if (types_are_mixed_float(src0_exec_type, src1_exec_type) ||
       types_are_mixed_float(src0_exec_type, dst_exec_type) ||
       types_are_mixed_float(src1_exec_type, dst_exec_type)) {
      return BRW_REGISTER_TYPE_F;
   }

   if (src0_exec_type == src1_exec_type)
      return src0_exec_type;

   if (src0_exec_type == BRW_REGISTER_TYPE_NF ||
       src1_exec_type == BRW_REGISTER_TYPE_NF)
      return BRW_REGISTER_TYPE_NF;

   /* Mixed operand types where one is float is float on Gen < 6
    * (and not allowed on later platforms)
    */
   if (devinfo->ver < 6 &&
       (src0_exec_type == BRW_REGISTER_TYPE_F ||
        src1_exec_type == BRW_REGISTER_TYPE_F))
      return BRW_REGISTER_TYPE_F;

   if (src0_exec_type == BRW_REGISTER_TYPE_Q ||
       src1_exec_type == BRW_REGISTER_TYPE_Q)
      return BRW_REGISTER_TYPE_Q;

   if (src0_exec_type == BRW_REGISTER_TYPE_D ||
       src1_exec_type == BRW_REGISTER_TYPE_D)
      return BRW_REGISTER_TYPE_D;

   if (src0_exec_type == BRW_REGISTER_TYPE_W ||
       src1_exec_type == BRW_REGISTER_TYPE_W)
      return BRW_REGISTER_TYPE_W;

   if (src0_exec_type == BRW_REGISTER_TYPE_DF ||
       src1_exec_type == BRW_REGISTER_TYPE_DF)
      return BRW_REGISTER_TYPE_DF;

   unreachable("not reached");
}

/**
 * Returns whether a region is packed
 *
 * A region is packed if its elements are adjacent in memory, with no
 * intervening space, no overlap, and no replicated values.
 */
static bool
is_packed(unsigned vstride, unsigned width, unsigned hstride)
{
   if (vstride == width) {
      if (vstride == 1) {
         /* Width-1 regions are packed only as the scalar <1;1,0>. */
         return hstride == 0;
      } else {
         return hstride == 1;
      }
   }

   return false;
}

/**
 * Returns whether a region is linear
 *
 * A region is linear if its elements do not overlap and are not replicated.
 * Unlike a packed region, intervening space (i.e. strided values) is allowed.
 */
static bool
is_linear(unsigned vstride, unsigned width, unsigned hstride)
{
   return vstride == width * hstride ||
          (hstride == 0 && width == 1);
}

/**
 * Returns whether an instruction is an explicit or implicit conversion
 * to/from half-float.
 */
static bool
is_half_float_conversion(const struct brw_isa_info *isa,
                         const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);

   if (dst_type != src0_type &&
       (dst_type == BRW_REGISTER_TYPE_HF || src0_type == BRW_REGISTER_TYPE_HF)) {
      return true;
   } else if (num_sources > 1) {
      enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);
      return dst_type != src1_type &&
             (dst_type == BRW_REGISTER_TYPE_HF ||
              src1_type == BRW_REGISTER_TYPE_HF);
   }

   return false;
}

/*
 * Returns whether an instruction is using mixed float operation mode
 */
static bool
is_mixed_float(const struct brw_isa_info *isa, const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* Mixed F/HF mode only exists on Gfx8+. */
   if (devinfo->ver < 8)
      return false;

   if (inst_is_send(isa, inst))
      return false;

   unsigned opcode = brw_inst_opcode(isa, inst);
   const struct opcode_desc *desc = brw_opcode_desc(isa, opcode);
   if (desc->ndst == 0)
      return false;

   /* FIXME: support 3-src instructions */
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   assert(num_sources < 3);

   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);
   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);

   if (num_sources == 1)
      return types_are_mixed_float(src0_type, dst_type);

   enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);

   /* Mixed mode if any operand pair mixes F and HF. */
   return types_are_mixed_float(src0_type, src1_type) ||
          types_are_mixed_float(src0_type, dst_type) ||
          types_are_mixed_float(src1_type, dst_type);
}

/**
 * Returns whether an instruction is an explicit or implicit conversion
 * to/from byte.
 */
static bool
is_byte_conversion(const struct brw_isa_info *isa,
                   const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);

   if (dst_type != src0_type &&
       (type_sz(dst_type) == 1 || type_sz(src0_type) == 1)) {
      return true;
   } else if (num_sources > 1) {
      enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);
      return dst_type != src1_type &&
             (type_sz(dst_type) == 1 || type_sz(src1_type) == 1);
   }

   return false;
}

/**
 * Checks restrictions listed in "General Restrictions Based on Operand Types"
 * in the "Register Region Restrictions" section.
 */
static struct string
general_restrictions_based_on_operand_types(const struct brw_isa_info *isa,
                                            const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   const struct opcode_desc *desc =
      brw_opcode_desc(isa, brw_inst_opcode(isa, inst));
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   if (inst_is_send(isa, inst))
      return error_msg;

   if (devinfo->ver >= 11) {
      /* A register type of B or UB for DPAS actually means 4 bytes packed into
       * a D or UD, so it is allowed.
       */
      if (num_sources == 3 && brw_inst_opcode(isa, inst) != BRW_OPCODE_DPAS) {
         ERROR_IF(brw_reg_type_to_size(brw_inst_3src_a1_src1_type(devinfo, inst)) == 1 ||
                  brw_reg_type_to_size(brw_inst_3src_a1_src2_type(devinfo, inst)) == 1,
                  "Byte data type is not supported for src1/2 register regioning. This includes "
                  "byte broadcast as well.");
      }
      if (num_sources == 2) {
         ERROR_IF(brw_reg_type_to_size(brw_inst_src1_type(devinfo, inst)) == 1,
                  "Byte data type is not supported for src1 register regioning. This includes "
                  "byte broadcast as well.");
      }
   }

   enum brw_reg_type dst_type;

   if (num_sources == 3) {
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1)
         dst_type = brw_inst_3src_a1_dst_type(devinfo, inst);
      else
         dst_type = brw_inst_3src_a16_dst_type(devinfo, inst);
   } else {
      dst_type = inst_dst_type(isa, inst);
   }

   ERROR_IF(dst_type == BRW_REGISTER_TYPE_DF &&
            !devinfo->has_64bit_float,
            "64-bit float destination, but platform does not support it");

   ERROR_IF((dst_type == BRW_REGISTER_TYPE_Q ||
             dst_type == BRW_REGISTER_TYPE_UQ) &&
            !devinfo->has_64bit_int,
            "64-bit int destination, but platform does not support it");

   for (unsigned s = 0; s < num_sources; s++) {
      enum brw_reg_type src_type;
      if (num_sources == 3) {
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            switch (s) {
            case 0: src_type = brw_inst_3src_a1_src0_type(devinfo, inst); break;
            case 1: src_type = brw_inst_3src_a1_src1_type(devinfo, inst); break;
            case 2: src_type = brw_inst_3src_a1_src2_type(devinfo, inst); break;
            default: unreachable("invalid src");
            }
         } else {
            src_type = brw_inst_3src_a16_src_type(devinfo, inst);
         }
      } else {
         switch (s) {
         case 0: src_type = brw_inst_src0_type(devinfo, inst); break;
         case 1: src_type = brw_inst_src1_type(devinfo, inst); break;
         default: unreachable("invalid src");
         }
      }

      ERROR_IF(src_type == BRW_REGISTER_TYPE_DF &&
               !devinfo->has_64bit_float,
               "64-bit float source, but platform does not support it");

      ERROR_IF((src_type == BRW_REGISTER_TYPE_Q ||
                src_type == BRW_REGISTER_TYPE_UQ) &&
               !devinfo->has_64bit_int,
               "64-bit int source, but platform does not support it");
   }

   /* The stride/alignment rules below only apply to 1- and 2-src
    * instructions with a real destination and exec size > 1.
    */
   if (num_sources == 3)
      return error_msg;

   if (exec_size == 1)
      return error_msg;

   if (desc->ndst == 0)
      return error_msg;

   /* The PRMs say:
    *
    *    Where n is the largest element size in bytes for any source or
    *    destination operand type, ExecSize * n must be <= 64.
    *
    * But we do not attempt to enforce it, because it is implied by other
    * rules:
    *
    *    - that the destination stride must match the execution data type
    *    - sources may not span more than two adjacent GRF registers
    *    - destination may not span more than two adjacent GRF registers
    *
    * In fact, checking it would weaken testing of the other rules.
    */

   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   bool dst_type_is_byte =
      inst_dst_type(isa, inst) == BRW_REGISTER_TYPE_B ||
      inst_dst_type(isa, inst) == BRW_REGISTER_TYPE_UB;

   if (dst_type_is_byte) {
      if (is_packed(exec_size * dst_stride, exec_size, dst_stride)) {
         if (!inst_is_raw_move(isa, inst))
            ERROR("Only raw MOV supports a packed-byte destination");
         return error_msg;
      }
   }

   unsigned exec_type = execution_type(isa, inst);
   unsigned exec_type_size = brw_reg_type_to_size(exec_type);
   unsigned dst_type_size = brw_reg_type_to_size(dst_type);

   /* On IVB/BYT, region parameters and execution size for DF are in terms of
    * 32-bit elements, so they are doubled. For evaluating the validity of an
    * instruction, we halve them.
    */
   if (devinfo->verx10 == 70 &&
       exec_type_size == 8 && dst_type_size == 4)
      dst_type_size = 8;

   if (is_byte_conversion(isa, inst)) {
      /* From the BDW+ PRM, Volume 2a, Command Reference, Instructions - MOV:
       *
       *    "There is no direct conversion from B/UB to DF or DF to B/UB.
       *     There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB."
       *
       * Even if these restrictions are listed for the MOV instruction, we
       * validate this more generally, since there is the possibility
       * of implicit conversions from other instructions.
       */
      enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
      enum brw_reg_type src1_type = num_sources > 1 ?
                                    brw_inst_src1_type(devinfo, inst) : 0;

      ERROR_IF(type_sz(dst_type) == 1 &&
               (type_sz(src0_type) == 8 ||
                (num_sources > 1 && type_sz(src1_type) == 8)),
               "There are no direct conversions between 64-bit types and B/UB");

      ERROR_IF(type_sz(dst_type) == 8 &&
               (type_sz(src0_type) == 1 ||
                (num_sources > 1 && type_sz(src1_type) == 1)),
               "There are no direct conversions between 64-bit types and B/UB");
   }

   if (is_half_float_conversion(isa, inst)) {
      /**
       * A helper to validate used in the validation of the following restriction
       * from the BDW+ PRM, Volume 2a, Command Reference, Instructions - MOV:
       *
       *    "There is no direct conversion from HF to DF or DF to HF.
       *     There is no direct conversion from HF to Q/UQ or Q/UQ to HF."
       *
       * Even if these restrictions are listed for the MOV instruction, we
       * validate this more generally, since there is the possibility
       * of implicit conversions from other instructions, such us implicit
       * conversion from integer to HF with the ADD instruction in SKL+.
       */
      enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
      enum brw_reg_type src1_type = num_sources > 1 ?
                                    brw_inst_src1_type(devinfo, inst) : 0;
      ERROR_IF(dst_type == BRW_REGISTER_TYPE_HF &&
               (type_sz(src0_type) == 8 ||
                (num_sources > 1 && type_sz(src1_type) == 8)),
               "There are no direct conversions between 64-bit types and HF");

      ERROR_IF(type_sz(dst_type) == 8 &&
               (src0_type == BRW_REGISTER_TYPE_HF ||
                (num_sources > 1 && src1_type == BRW_REGISTER_TYPE_HF)),
               "There are no direct conversions between 64-bit types and HF");

      /* From the BDW+ PRM:
       *
       *   "Conversion between Integer and HF (Half Float) must be
       *    DWord-aligned and strided by a DWord on the destination."
       *
       * Also, the above restrictions seems to be expanded on CHV and SKL+ by:
       *
       *   "There is a relaxed alignment rule for word destinations. When
       *    the destination type is word (UW, W, HF), destination data types
       *    can be aligned to either the lowest word or the second lowest
       *    word of the execution channel. This means the destination data
       *    words can be either all in the even word locations or all in the
       *    odd word locations."
       *
       * We do not implement the second rule as is though, since empirical
       * testing shows inconsistencies:
       *   - It suggests that packed 16-bit is not allowed, which is not true.
       *   - It suggests that conversions from Q/DF to W (which need to be
       *     64-bit aligned on the destination) are not possible, which is
       *     not true.
       *
       * So from this rule we only validate the implication that conversions
       * from F to HF need to be DWord strided (except in Align1 mixed
       * float mode where packed fp16 destination is allowed so long as the
       * destination is oword-aligned).
       *
       * Finally, we only validate this for Align1 because Align16 always
       * requires packed destinations, so these restrictions can't possibly
       * apply to Align16 mode.
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         if ((dst_type == BRW_REGISTER_TYPE_HF &&
              (brw_reg_type_is_integer(src0_type) ||
               (num_sources > 1 && brw_reg_type_is_integer(src1_type)))) ||
             (brw_reg_type_is_integer(dst_type) &&
              (src0_type == BRW_REGISTER_TYPE_HF ||
               (num_sources > 1 && src1_type == BRW_REGISTER_TYPE_HF)))) {
            ERROR_IF(dst_stride * dst_type_size != 4,
                     "Conversions between integer and half-float must be "
                     "strided by a DWord on the destination");

            unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
            ERROR_IF(subreg % 4 != 0,
                     "Conversions between integer and half-float must be "
                     "aligned to a DWord on the destination");
         } else if ((devinfo->platform == INTEL_PLATFORM_CHV ||
                     devinfo->ver >= 9) &&
                    dst_type == BRW_REGISTER_TYPE_HF) {
            unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
            ERROR_IF(dst_stride != 2 &&
                     !(is_mixed_float(isa, inst) &&
                       dst_stride == 1 && subreg % 16 == 0),
                     "Conversions to HF must have either all words in even "
                     "word locations or all words in odd word locations or "
                     "be mixed-float with Oword-aligned packed destination");
         }
      }
   }

   /* There are special regioning rules for mixed-float mode in CHV and SKL that
    * override the general rule for the ratio of sizes of the destination type
    * and the execution type. We will add validation for those in a later patch.
    */
   bool validate_dst_size_and_exec_size_ratio =
      !is_mixed_float(isa, inst) ||
      !(devinfo->platform == INTEL_PLATFORM_CHV || devinfo->ver >= 9);

   if (validate_dst_size_and_exec_size_ratio &&
       exec_type_size > dst_type_size) {
      if (!(dst_type_is_byte && inst_is_raw_move(isa, inst))) {
         ERROR_IF(dst_stride * dst_type_size != exec_type_size,
                  "Destination stride must be equal to the ratio of the sizes "
                  "of the execution data type to the destination type");
      }

      unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 &&
          brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
         /* The i965 PRM says:
          *
          *    Implementation Restriction: The relaxed alignment rule for byte
          *    destination (#10.5) is not supported.
          */
         if (devinfo->verx10 >= 45 && dst_type_is_byte) {
            ERROR_IF(subreg % exec_type_size != 0 &&
                     subreg % exec_type_size != 1,
                     "Destination subreg must be aligned to the size of the "
                     "execution data type (or to the next lowest byte for byte "
                     "destinations)");
         } else {
            ERROR_IF(subreg % exec_type_size != 0,
                     "Destination subreg must be aligned to the size of the "
                     "execution data type");
         }
      }
   }

   return error_msg;
}

/**
 * Checks restrictions listed in "General Restrictions on Regioning Parameters"
 * in the "Register Region Restrictions" section.
 */
static struct string
general_restrictions_on_region_parameters(const struct brw_isa_info *isa,
                                          const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   const struct opcode_desc *desc =
      brw_opcode_desc(isa, brw_inst_opcode(isa, inst));
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   if (num_sources == 3)
      return (struct string){};

   /* Split sends don't have the bits in the instruction to encode regions so
    * there's nothing to check.
    */
   if (inst_is_split_send(isa, inst))
      return (struct string){};

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16) {
      if (desc->ndst != 0 && !dst_is_null(devinfo, inst))
         ERROR_IF(brw_inst_dst_hstride(devinfo, inst) != BRW_HORIZONTAL_STRIDE_1,
                  "Destination Horizontal Stride must be 1");

      if (num_sources >= 1) {
         if (devinfo->verx10 >= 75) {
            ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
                     "In Align16 mode, only VertStride of 0, 2, or 4 is allowed");
         } else {
            ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
                     "In Align16 mode, only VertStride of 0 or 4 is allowed");
         }
      }

      if (num_sources == 2) {
         if (devinfo->verx10 >= 75) {
            ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
                     "In Align16 mode, only VertStride of 0, 2, or 4 is allowed");
         } else {
            ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
                     "In Align16 mode, only VertStride of 0 or 4 is allowed");
         }
      }

      return error_msg;
   }

   /* Align1 mode: decode each source's region and check the PRM rules. */
   for (unsigned i = 0; i < num_sources; i++) {
      unsigned vstride, width, hstride, element_size, subreg;
      enum brw_reg_type type;

#define DO_SRC(n)                                                              \
      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==                     \
          BRW_IMMEDIATE_VALUE)                                                 \
         continue;                                                             \
                                                                               \
      vstride = STRIDE(brw_inst_src ## n ## _vstride(devinfo, inst));          \
      width = WIDTH(brw_inst_src ## n ## _width(devinfo, inst));               \
      hstride = STRIDE(brw_inst_src ## n ## _hstride(devinfo, inst));          \
      type = brw_inst_src ## n ## _type(devinfo, inst);                        \
      element_size = brw_reg_type_to_size(type);                               \
      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst)

      if (i == 0) {
         DO_SRC(0);
      } else {
         DO_SRC(1);
      }
#undef DO_SRC

      /* On IVB/BYT, region parameters and execution size for DF are in terms of
       * 32-bit elements, so they are doubled. For evaluating the validity of an
       * instruction, we halve them.
       */
      if (devinfo->verx10 == 70 &&
          element_size == 8)
         element_size = 4;

      /* ExecSize must be greater than or equal to Width. */
      ERROR_IF(exec_size < width, "ExecSize must be greater than or equal "
               "to Width");

      /* If ExecSize = Width and HorzStride ≠ 0,
       * VertStride must be set to Width * HorzStride.
       */
      if (exec_size == width && hstride != 0) {
         ERROR_IF(vstride != width * hstride,
                  "If ExecSize = Width and HorzStride ≠ 0, "
                  "VertStride must be set to Width * HorzStride");
      }

      /* If Width = 1, HorzStride must be 0 regardless of the values of
       * ExecSize and VertStride.
       */
      if (width == 1) {
         ERROR_IF(hstride != 0,
                  "If Width = 1, HorzStride must be 0 regardless "
                  "of the values of ExecSize and VertStride");
      }

      /* If ExecSize = Width = 1, both VertStride and HorzStride must be 0. */
      if (exec_size == 1 && width == 1) {
         ERROR_IF(vstride != 0 || hstride != 0,
                  "If ExecSize = Width = 1, both VertStride "
                  "and HorzStride must be 0");
      }

      /* If VertStride = HorzStride = 0, Width must be 1 regardless of the
       * value of ExecSize.
       */
      if (vstride == 0 && hstride == 0) {
         ERROR_IF(width != 1,
                  "If VertStride = HorzStride = 0, Width must be "
                  "1 regardless of the value of ExecSize");
      }

      /* VertStride must be used to cross GRF register boundaries. This rule
       * implies that elements within a 'Width' cannot cross GRF boundaries.
       * The loop builds a per-row byte-access mask within a 64-byte window
       * and flags any row that touches both 32-byte halves.
       */
      const uint64_t mask = (1ULL << element_size) - 1;
      unsigned rowbase = subreg;

      for (int y = 0; y < exec_size / width; y++) {
         uint64_t access_mask = 0;
         unsigned offset = rowbase;

         for (int x = 0; x < width; x++) {
            access_mask |= mask << (offset % 64);
            offset += hstride * element_size;
         }

         rowbase += vstride * element_size;

         if ((uint32_t)access_mask != 0 && (access_mask >> 32) != 0) {
            ERROR("VertStride must be used to cross GRF register boundaries");
            break;
         }
      }
   }

   /* Dst.HorzStride must not be 0. */
   if (desc->ndst != 0 && !dst_is_null(devinfo, inst)) {
      ERROR_IF(brw_inst_dst_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0,
               "Destination Horizontal Stride must not be 0");
   }

   return error_msg;
}

/* Checks the SKL PRM's "Special Restrictions for Handling Mixed Mode
 * Float Operations".
 */
static struct string
special_restrictions_for_mixed_float_mode(const struct brw_isa_info *isa,
                                          const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   struct string error_msg = { .str = NULL, .len = 0 };

   const unsigned opcode = brw_inst_opcode(isa, inst);
   const unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   if (num_sources >= 3)
      return error_msg;

   if (!is_mixed_float(isa, inst))
      return error_msg;

   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   bool is_align16 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16;

   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
   enum brw_reg_type src1_type = num_sources > 1 ?
                                 brw_inst_src1_type(devinfo, inst) : 0;
   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);

   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   bool dst_is_packed = is_packed(exec_size * dst_stride, exec_size, dst_stride);

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "Indirect addressing on source is not supported when source and
    *     destination data types are mixed float."
    */
   ERROR_IF(brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT ||
            (num_sources > 1 &&
             brw_inst_src1_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT),
            "Indirect addressing on source is not supported when source and "
            "destination data types are mixed float");

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is f32. Instruction
    *     execution size must be no more than 8."
+ */ + ERROR_IF(exec_size > 8 && dst_type == BRW_REGISTER_TYPE_F, + "Mixed float mode with 32-bit float destination is limited " + "to SIMD8"); + + if (is_align16) { + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "In Align16 mode, when half float and float data types are mixed + * between source operands OR between source and destination operands, + * the register content are assumed to be packed." + * + * Since Align16 doesn't have a concept of horizontal stride (or width), + * it means that vertical stride must always be 4, since 0 and 2 would + * lead to replicated data, and any other value is disallowed in Align16. + */ + ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4, + "Align16 mixed float mode assumes packed data (vstride must be 4"); + + ERROR_IF(num_sources >= 2 && + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4, + "Align16 mixed float mode assumes packed data (vstride must be 4"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "For Align16 mixed mode, both input and output packed f16 data + * must be oword aligned, no oword crossing in packed f16." + * + * The previous rule requires that Align16 operands are always packed, + * and since there is only one bit for Align16 subnr, which represents + * offsets 0B and 16B, this rule is always enforced and we don't need to + * validate it. + */ + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No SIMD16 in mixed mode when destination is packed f16 for both + * Align1 and Align16." + * + * And: + * + * "In Align16 mode, when half float and float data types are mixed + * between source operands OR between source and destination operands, + * the register content are assumed to be packed." + * + * Which implies that SIMD16 is not available in Align16. 
This is further + * confirmed by: + * + * "For Align16 mixed mode, both input and output packed f16 data + * must be oword aligned, no oword crossing in packed f16" + * + * Since oword-aligned packed f16 data would cross oword boundaries when + * the execution size is larger than 8. + */ + ERROR_IF(exec_size > 8, "Align16 mixed float mode is limited to SIMD8"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No accumulator read access for Align16 mixed float." + */ + ERROR_IF(inst_uses_src_acc(isa, inst), + "No accumulator read access for Align16 mixed float"); + } else { + assert(!is_align16); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No SIMD16 in mixed mode when destination is packed f16 for both + * Align1 and Align16." + */ + ERROR_IF(exec_size > 8 && dst_is_packed && + dst_type == BRW_REGISTER_TYPE_HF, + "Align1 mixed float mode is limited to SIMD8 when destination " + "is packed half-float"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "Math operations for mixed mode: + * - In Align1, f16 inputs need to be strided" + */ + if (opcode == BRW_OPCODE_MATH) { + if (src0_type == BRW_REGISTER_TYPE_HF) { + ERROR_IF(STRIDE(brw_inst_src0_hstride(devinfo, inst)) <= 1, + "Align1 mixed mode math needs strided half-float inputs"); + } + + if (num_sources >= 2 && src1_type == BRW_REGISTER_TYPE_HF) { + ERROR_IF(STRIDE(brw_inst_src1_hstride(devinfo, inst)) <= 1, + "Align1 mixed mode math needs strided half-float inputs"); + } + } + + if (dst_type == BRW_REGISTER_TYPE_HF && dst_stride == 1) { + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "In Align1, destination stride can be smaller than execution + * type. When destination is stride of 1, 16 bit packed data is + * updated on the destination. 
However, output packed f16 data + * must be oword aligned, no oword crossing in packed f16." + * + * The requirement of not crossing oword boundaries for 16-bit oword + * aligned data means that execution size is limited to 8. + */ + unsigned subreg; + if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) + subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst); + else + subreg = brw_inst_dst_ia_subreg_nr(devinfo, inst); + ERROR_IF(subreg % 16 != 0, + "Align1 mixed mode packed half-float output must be " + "oword aligned"); + ERROR_IF(exec_size > 8, + "Align1 mixed mode packed half-float output must not " + "cross oword boundaries (max exec size is 8)"); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "When source is float or half float from accumulator register and + * destination is half float with a stride of 1, the source must + * register aligned. i.e., source must have offset zero." + * + * Align16 mixed float mode doesn't allow accumulator access on sources, + * so we only need to check this for Align1. + */ + if (src0_is_acc(devinfo, inst) && + (src0_type == BRW_REGISTER_TYPE_F || + src0_type == BRW_REGISTER_TYPE_HF)) { + ERROR_IF(brw_inst_src0_da1_subreg_nr(devinfo, inst) != 0, + "Mixed float mode requires register-aligned accumulator " + "source reads when destination is packed half-float"); + + } + + if (num_sources > 1 && + src1_is_acc(devinfo, inst) && + (src1_type == BRW_REGISTER_TYPE_F || + src1_type == BRW_REGISTER_TYPE_HF)) { + ERROR_IF(brw_inst_src1_da1_subreg_nr(devinfo, inst) != 0, + "Mixed float mode requires register-aligned accumulator " + "source reads when destination is packed half-float"); + } + } + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No swizzle is allowed when an accumulator is used as an implicit + * source or an explicit source in an instruction. i.e. 
when + * destination is half float with an implicit accumulator source, + * destination stride needs to be 2." + * + * FIXME: it is not quite clear what the first sentence actually means + * or its link to the implication described after it, so we only + * validate the explicit implication, which is clearly described. + */ + if (dst_type == BRW_REGISTER_TYPE_HF && + inst_uses_src_acc(isa, inst)) { + ERROR_IF(dst_stride != 2, + "Mixed float mode with implicit/explicit accumulator " + "source and half-float destination requires a stride " + "of 2 on the destination"); + } + } + + return error_msg; +} + +/** + * Creates an \p access_mask for an \p exec_size, \p element_size, and a region + * + * An \p access_mask is a 32-element array of uint64_t, where each uint64_t is + * a bitmask of bytes accessed by the region. + * + * For instance the access mask of the source gX.1<4,2,2>F in an exec_size = 4 + * instruction would be + * + * access_mask[0] = 0x00000000000000F0 + * access_mask[1] = 0x000000000000F000 + * access_mask[2] = 0x0000000000F00000 + * access_mask[3] = 0x00000000F0000000 + * access_mask[4-31] = 0 + * + * because the first execution channel accesses bytes 7-4 and the second + * execution channel accesses bytes 15-12, etc. 
+ */ +static void +align1_access_mask(uint64_t access_mask[static 32], + unsigned exec_size, unsigned element_size, unsigned subreg, + unsigned vstride, unsigned width, unsigned hstride) +{ + const uint64_t mask = (1ULL << element_size) - 1; + unsigned rowbase = subreg; + unsigned element = 0; + + for (int y = 0; y < exec_size / width; y++) { + unsigned offset = rowbase; + + for (int x = 0; x < width; x++) { + access_mask[element++] = mask << (offset % 64); + offset += hstride * element_size; + } + + rowbase += vstride * element_size; + } + + assert(element == 0 || element == exec_size); +} + +/** + * Returns the number of registers accessed according to the \p access_mask + */ +static int +registers_read(const uint64_t access_mask[static 32]) +{ + int regs_read = 0; + + for (unsigned i = 0; i < 32; i++) { + if (access_mask[i] > 0xFFFFFFFF) { + return 2; + } else if (access_mask[i]) { + regs_read = 1; + } + } + + return regs_read; +} + +/** + * Checks restrictions listed in "Region Alignment Rules" in the "Register + * Region Restrictions" section. 
 */
static struct string
region_alignment_rules(const struct brw_isa_info *isa,
                       const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const struct opcode_desc *desc =
      brw_opcode_desc(isa, brw_inst_opcode(isa, inst));
   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
   uint64_t dst_access_mask[32], src0_access_mask[32], src1_access_mask[32];
   struct string error_msg = { .str = NULL, .len = 0 };

   /* These rules only apply to Align1, non-send, non-3src instructions. */
   if (num_sources == 3)
      return (struct string){};

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16)
      return (struct string){};

   if (inst_is_send(isa, inst))
      return (struct string){};

   memset(dst_access_mask, 0, sizeof(dst_access_mask));
   memset(src0_access_mask, 0, sizeof(src0_access_mask));
   memset(src1_access_mask, 0, sizeof(src1_access_mask));

   for (unsigned i = 0; i < num_sources; i++) {
      unsigned vstride, width, hstride, element_size, subreg;
      enum brw_reg_type type;

      /* In Direct Addressing mode, a source cannot span more than 2 adjacent
       * GRF registers.
       */

/* DO_SRC(n): load src<n>'s region parameters and build its byte access mask;
 * 'continue' skips indirect and immediate sources, which are not checked.
 */
#define DO_SRC(n)                                                       \
      if (brw_inst_src ## n ## _address_mode(devinfo, inst) !=          \
          BRW_ADDRESS_DIRECT)                                           \
         continue;                                                      \
                                                                        \
      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==              \
          BRW_IMMEDIATE_VALUE)                                          \
         continue;                                                      \
                                                                        \
      vstride = STRIDE(brw_inst_src ## n ## _vstride(devinfo, inst));   \
      width = WIDTH(brw_inst_src ## n ## _width(devinfo, inst));        \
      hstride = STRIDE(brw_inst_src ## n ## _hstride(devinfo, inst));   \
      type = brw_inst_src ## n ## _type(devinfo, inst);                 \
      element_size = brw_reg_type_to_size(type);                        \
      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);      \
      align1_access_mask(src ## n ## _access_mask,                      \
                         exec_size, element_size, subreg,               \
                         vstride, width, hstride)

      if (i == 0) {
         DO_SRC(0);
      } else {
         DO_SRC(1);
      }
#undef DO_SRC

      /* Byte offset of the last element of the region; it must stay within
       * the two-register window.
       */
      unsigned num_vstride = exec_size / width;
      unsigned num_hstride = width;
      unsigned vstride_elements = (num_vstride - 1) * vstride;
      unsigned hstride_elements = (num_hstride - 1) * hstride;
      unsigned offset = (vstride_elements + hstride_elements) * element_size +
                        subreg;
      ERROR_IF(offset >= 64 * reg_unit(devinfo),
               "A source cannot span more than 2 adjacent GRF registers");
   }

   if (desc->ndst == 0 || dst_is_null(devinfo, inst))
      return error_msg;

   unsigned stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   enum brw_reg_type dst_type = inst_dst_type(isa, inst);
   unsigned element_size = brw_reg_type_to_size(dst_type);
   unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
   unsigned offset = ((exec_size - 1) * stride * element_size) + subreg;
   ERROR_IF(offset >= 64 * reg_unit(devinfo),
            "A destination cannot span more than 2 adjacent GRF registers");

   /* Bail out before building access masks if a span is already known to be
    * out of bounds.
    */
   if (error_msg.str)
      return error_msg;

   /* On IVB/BYT, region parameters and execution size for DF are in terms of
    * 32-bit elements, so they are doubled. For evaluating the validity of an
    * instruction, we halve them.
    */
   if (devinfo->verx10 == 70 &&
       element_size == 8)
      element_size = 4;

   /* Synthesize a <exec_size*stride; exec_size; stride> region for the
    * destination (or a scalar region when exec_size == 1).
    */
   align1_access_mask(dst_access_mask, exec_size, element_size, subreg,
                      exec_size == 1 ? 0 : exec_size * stride,
                      exec_size == 1 ? 1 : exec_size,
                      exec_size == 1 ? 0 : stride);

   unsigned dst_regs = registers_read(dst_access_mask);
   unsigned src0_regs = registers_read(src0_access_mask);
   unsigned src1_regs = registers_read(src1_access_mask);

   /* The SNB, IVB, HSW, BDW, and CHV PRMs say:
    *
    *    When an instruction has a source region spanning two registers and a
    *    destination region contained in one register, the number of elements
    *    must be the same between two sources and one of the following must be
    *    true:
    *
    *       1. The destination region is entirely contained in the lower OWord
    *          of a register.
    *       2. The destination region is entirely contained in the upper OWord
    *          of a register.
    *       3. The destination elements are evenly split between the two OWords
    *          of a register.
    */
   if (devinfo->ver <= 8) {
      if (dst_regs == 1 && (src0_regs == 2 || src1_regs == 2)) {
         unsigned upper_oword_writes = 0, lower_oword_writes = 0;

         for (unsigned i = 0; i < exec_size; i++) {
            /* > 0x0000FFFF means a byte beyond the first OWord (16 bytes). */
            if (dst_access_mask[i] > 0x0000FFFF) {
               upper_oword_writes++;
            } else {
               assert(dst_access_mask[i] != 0);
               lower_oword_writes++;
            }
         }

         ERROR_IF(lower_oword_writes != 0 &&
                  upper_oword_writes != 0 &&
                  upper_oword_writes != lower_oword_writes,
                  "Writes must be to only one OWord or "
                  "evenly split between OWords");
      }
   }

   /* The IVB and HSW PRMs say:
    *
    *    When an instruction has a source region that spans two registers and
    *    the destination spans two registers, the destination elements must be
    *    evenly split between the two registers [...]
    *
    * The SNB PRM contains similar wording (but written in a much more
    * confusing manner).
    *
    * The BDW PRM says:
    *
    *    When destination spans two registers, the source may be one or two
    *    registers. The destination elements must be evenly split between the
    *    two registers.
    *
    * The SKL PRM says:
    *
    *    When destination of MATH instruction spans two registers, the
    *    destination elements must be evenly split between the two registers.
    *
    * It is not known whether this restriction applies to KBL other Gens after
    * SKL.
    */
   if (devinfo->ver <= 8 ||
       brw_inst_opcode(isa, inst) == BRW_OPCODE_MATH) {

      /* Nothing explicitly states that on Gen < 8 elements must be evenly
       * split between two destination registers in the two exceptional
       * source-region-spans-one-register cases, but since Broadwell requires
       * evenly split writes regardless of source region, we assume that it was
       * an oversight and require it.
       */
      if (dst_regs == 2) {
         unsigned upper_reg_writes = 0, lower_reg_writes = 0;

         for (unsigned i = 0; i < exec_size; i++) {
            /* > 0xFFFFFFFF means a byte in the second 32-byte register. */
            if (dst_access_mask[i] > 0xFFFFFFFF) {
               upper_reg_writes++;
            } else {
               assert(dst_access_mask[i] != 0);
               lower_reg_writes++;
            }
         }

         ERROR_IF(upper_reg_writes != lower_reg_writes,
                  "Writes must be evenly split between the two "
                  "destination registers");
      }
   }

   /* The IVB and HSW PRMs say:
    *
    *    When an instruction has a source region that spans two registers and
    *    the destination spans two registers, the destination elements must be
    *    evenly split between the two registers and each destination register
    *    must be entirely derived from one source register.
    *
    *    Note: In such cases, the regioning parameters must ensure that the
    *    offset from the two source registers is the same.
    *
    * The SNB PRM contains similar wording (but written in a much more
    * confusing manner).
    *
    * There are effectively three rules stated here:
    *
    *    For an instruction with a source and a destination spanning two
    *    registers,
    *
    *       (1) destination elements must be evenly split between the two
    *           registers
    *       (2) all destination elements in a register must be derived
    *           from one source register
    *       (3) the offset (i.e. the starting location in each of the two
    *           registers spanned by a region) must be the same in the two
    *           registers spanned by a region
    *
    * It is impossible to violate rule (1) without violating (2) or (3), so we
    * do not attempt to validate it.
    */
   if (devinfo->ver <= 7 && dst_regs == 2) {
      for (unsigned i = 0; i < num_sources; i++) {
/* DO_SRC(n): for a two-register src<n>, verify rules (2) and (3) above;
 * offset_1 is the byte offset of the first element read from the second
 * register, recovered from the access mask with __builtin_ctzll.
 */
#define DO_SRC(n)                                                             \
         if (src ## n ## _regs <= 1)                                          \
            continue;                                                         \
                                                                              \
         for (unsigned i = 0; i < exec_size; i++) {                           \
            if ((dst_access_mask[i] > 0xFFFFFFFF) !=                          \
                (src ## n ## _access_mask[i] > 0xFFFFFFFF)) {                 \
               ERROR("Each destination register must be entirely derived "    \
                     "from one source register");                             \
               break;                                                         \
            }                                                                 \
         }                                                                    \
                                                                              \
         unsigned offset_0 =                                                  \
            brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);               \
         unsigned offset_1 = offset_0;                                        \
                                                                              \
         for (unsigned i = 0; i < exec_size; i++) {                           \
            if (src ## n ## _access_mask[i] > 0xFFFFFFFF) {                   \
               offset_1 = __builtin_ctzll(src ## n ## _access_mask[i]) - 32;  \
               break;                                                         \
            }                                                                 \
         }                                                                    \
                                                                              \
         ERROR_IF(num_sources == 2 && offset_0 != offset_1,                   \
                  "The offset from the two source registers "                 \
                  "must be the same")

         if (i == 0) {
            DO_SRC(0);
         } else {
            DO_SRC(1);
         }
#undef DO_SRC
      }
   }

   /* The IVB and HSW PRMs say:
    *
    *    When destination spans two registers, the source MUST span two
    *    registers. The exception to the above rule:
    *        1. When source is scalar, the source registers are not
    *           incremented.
    *        2. When source is packed integer Word and destination is packed
    *           integer DWord, the source register is not incremented by the
    *           source sub register is incremented.
    *
    * The SNB PRM does not contain this rule, but the internal documentation
    * indicates that it applies to SNB as well. We assume that the rule applies
    * to Gen <= 5 although their PRMs do not state it.
    *
    * While the documentation explicitly says in exception (2) that the
    * destination must be an integer DWord, the hardware allows at least a
    * float destination type as well. We emit such instructions from
    *
    *    fs_visitor::emit_interpolation_setup_gfx6
    *    fs_visitor::emit_fragcoord_interpolation
    *
    * and have for years with no ill effects.
    *
    * Additionally the simulator source code indicates that the real condition
    * is that the size of the destination type is 4 bytes.
    *
    * HSW PRMs also add a note to the second exception:
    *    "When lower 8 channels are disabled, the sub register of source1
    *     operand is not incremented. If the lower 8 channels are expected
    *     to be disabled, say by predication, the instruction must be split
    *     into pair of simd8 operations."
    *
    * We can't reliably know if the channels won't be disabled due to,
    * for example, IMASK. So, play it safe and disallow packed-word exception
    * for src1.
    */
   if (devinfo->ver <= 7 && dst_regs == 2) {
      enum brw_reg_type dst_type = inst_dst_type(isa, inst);
      bool dst_is_packed_dword =
         is_packed(exec_size * stride, exec_size, stride) &&
         brw_reg_type_to_size(dst_type) == 4;

      for (unsigned i = 0; i < num_sources; i++) {
/* DO_SRC(n): enforce the two-register-source rule, allowing the scalar and
 * (src0-only, per the HSW note above) packed-word-to-packed-dword exceptions.
 */
#define DO_SRC(n)                                                             \
         unsigned vstride, width, hstride;                                    \
         vstride = STRIDE(brw_inst_src ## n ## _vstride(devinfo, inst));      \
         width = WIDTH(brw_inst_src ## n ## _width(devinfo, inst));           \
         hstride = STRIDE(brw_inst_src ## n ## _hstride(devinfo, inst));      \
         bool src ## n ## _is_packed_word =                                   \
            n != 1 && is_packed(vstride, width, hstride) &&                   \
            (brw_inst_src ## n ## _type(devinfo, inst) == BRW_REGISTER_TYPE_W || \
             brw_inst_src ## n ## _type(devinfo, inst) == BRW_REGISTER_TYPE_UW); \
                                                                              \
         ERROR_IF(src ## n ## _regs == 1 &&                                   \
                  !src ## n ## _has_scalar_region(devinfo, inst) &&           \
                  !(dst_is_packed_dword && src ## n ## _is_packed_word),      \
                  "When the destination spans two registers, the source must " \
                  "span two registers\n" ERROR_INDENT "(exceptions for scalar " \
                  "sources, and packed-word to packed-dword expansion for src0)")

         if (i == 0) {
            DO_SRC(0);
         } else {
            DO_SRC(1);
         }
#undef DO_SRC
      }
   }

   return error_msg;
}

/* Validates the restrictions on immediate vector operands (V/UV/VF): the
 * immediate is always the last source of the instruction.
 */
static struct string
vector_immediate_restrictions(const struct brw_isa_info *isa,
                              const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   if (num_sources == 3 || num_sources == 0 ||
       (devinfo->ver >= 12 && inst_is_send(isa, inst)))
      return (struct string){};

   /* An immediate can only ever be the last source. */
   unsigned file = num_sources == 1 ?
                   brw_inst_src0_reg_file(devinfo, inst) :
                   brw_inst_src1_reg_file(devinfo, inst);
   if (file != BRW_IMMEDIATE_VALUE)
      return (struct string){};

   enum brw_reg_type dst_type = inst_dst_type(isa, inst);
   unsigned dst_type_size = brw_reg_type_to_size(dst_type);
   unsigned dst_subreg = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 ?
                         brw_inst_dst_da1_subreg_nr(devinfo, inst) : 0;
   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   enum brw_reg_type type = num_sources == 1 ?
                            brw_inst_src0_type(devinfo, inst) :
                            brw_inst_src1_type(devinfo, inst);

   /* The PRMs say:
    *
    *    When an immediate vector is used in an instruction, the destination
    *    must be 128-bit aligned with destination horizontal stride equivalent
    *    to a word for an immediate integer vector (v) and equivalent to a
    *    DWord for an immediate float vector (vf).
    *
    * The text has not been updated for the addition of the immediate unsigned
    * integer vector type (uv) on SNB, but presumably the same restriction
    * applies.
    */
   switch (type) {
   case BRW_REGISTER_TYPE_V:
   case BRW_REGISTER_TYPE_UV:
   case BRW_REGISTER_TYPE_VF:
      ERROR_IF(dst_subreg % (128 / 8) != 0,
               "Destination must be 128-bit aligned in order to use immediate "
               "vector types");

      if (type == BRW_REGISTER_TYPE_VF) {
         ERROR_IF(dst_type_size * dst_stride != 4,
                  "Destination must have stride equivalent to dword in order "
                  "to use the VF type");
      } else {
         ERROR_IF(dst_type_size * dst_stride != 2,
                  "Destination must have stride equivalent to word in order "
                  "to use the V or UV type");
      }
      break;
   default:
      break;
   }

   return error_msg;
}

/* Validates hardware restrictions that apply when either operand is 64-bit
 * ("double precision" here covers DF/Q/UQ data as well as integer DWord
 * multiplies, which use the 64-bit datapath).
 */
static struct string
special_requirements_for_handling_double_precision_data_types(
                                       const struct brw_isa_info *isa,
                                       const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   unsigned num_sources = brw_num_sources_from_inst(isa, inst);
   struct string error_msg = { .str = NULL, .len = 0 };

   if (num_sources == 3 || num_sources == 0)
      return (struct string){};

   /* Split sends don't have types so there's no doubles there. */
   if (inst_is_split_send(isa, inst))
      return (struct string){};

   enum brw_reg_type exec_type = execution_type(isa, inst);
   unsigned exec_type_size = brw_reg_type_to_size(exec_type);

   enum brw_reg_file dst_file = brw_inst_dst_reg_file(devinfo, inst);
   enum brw_reg_type dst_type = inst_dst_type(isa, inst);
   unsigned dst_type_size = brw_reg_type_to_size(dst_type);
   unsigned dst_hstride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
   unsigned dst_reg = brw_inst_dst_da_reg_nr(devinfo, inst);
   unsigned dst_subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
   unsigned dst_address_mode = brw_inst_dst_address_mode(devinfo, inst);

   /* DWord x DWord integer MUL uses the 64-bit pipeline on Gfx8+. */
   bool is_integer_dword_multiply =
      devinfo->ver >= 8 &&
      brw_inst_opcode(isa, inst) == BRW_OPCODE_MUL &&
      (brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_D ||
       brw_inst_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_UD) &&
      (brw_inst_src1_type(devinfo, inst) == BRW_REGISTER_TYPE_D ||
       brw_inst_src1_type(devinfo, inst) == BRW_REGISTER_TYPE_UD);

   const bool is_double_precision =
      dst_type_size == 8 || exec_type_size == 8 || is_integer_dword_multiply;

   for (unsigned i = 0; i < num_sources; i++) {
      unsigned vstride, width, hstride, type_size, reg, subreg, address_mode;
      bool is_scalar_region;
      enum brw_reg_file file;
      enum brw_reg_type type;

/* DO_SRC(n): load the operand description of src<n> into the locals above;
 * 'continue' skips immediate sources.
 */
#define DO_SRC(n)                                                              \
      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==                     \
          BRW_IMMEDIATE_VALUE)                                                 \
         continue;                                                             \
                                                                               \
      is_scalar_region = src ## n ## _has_scalar_region(devinfo, inst);        \
      vstride = STRIDE(brw_inst_src ## n ## _vstride(devinfo, inst));          \
      width = WIDTH(brw_inst_src ## n ## _width(devinfo, inst));               \
      hstride = STRIDE(brw_inst_src ## n ## _hstride(devinfo, inst));          \
      file = brw_inst_src ## n ## _reg_file(devinfo, inst);                    \
      type = brw_inst_src ## n ## _type(devinfo, inst);                        \
      type_size = brw_reg_type_to_size(type);                                  \
      reg = brw_inst_src ## n ## _da_reg_nr(devinfo, inst);                    \
      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);             \
      address_mode = brw_inst_src ## n ## _address_mode(devinfo, inst)

      if (i == 0) {
         DO_SRC(0);
      } else {
         DO_SRC(1);
      }
#undef DO_SRC

      const unsigned src_stride = (hstride ? hstride : vstride) * type_size;
      const unsigned dst_stride = dst_hstride * dst_type_size;

      /* The PRMs say that for CHV, BXT:
       *
       *    When source or destination datatype is 64b or operation is integer
       *    DWord multiply, regioning in Align1 must follow these rules:
       *
       *    1. Source and Destination horizontal stride must be aligned to the
       *       same qword.
       *    2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
       *    3. Source and Destination offset must be the same, except the case
       *       of scalar source.
       *
       * We assume that the restriction applies to GLK as well.
       */
      if (is_double_precision &&
          brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 &&
          (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo))) {
         ERROR_IF(!is_scalar_region &&
                  (src_stride % 8 != 0 ||
                   dst_stride % 8 != 0 ||
                   src_stride != dst_stride),
                  "Source and destination horizontal stride must equal and a "
                  "multiple of a qword when the execution type is 64-bit");

         ERROR_IF(vstride != width * hstride,
                  "Vstride must be Width * Hstride when the execution type is "
                  "64-bit");

         ERROR_IF(!is_scalar_region && dst_subreg != subreg,
                  "Source and destination offset must be the same when the "
                  "execution type is 64-bit");
      }

      /* The PRMs say that for CHV, BXT:
       *
       *    When source or destination datatype is 64b or operation is integer
       *    DWord multiply, indirect addressing must not be used.
       *
       * We assume that the restriction applies to GLK as well.
       */
      if (is_double_precision &&
          (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo))) {
         ERROR_IF(BRW_ADDRESS_REGISTER_INDIRECT_REGISTER == address_mode ||
                  BRW_ADDRESS_REGISTER_INDIRECT_REGISTER == dst_address_mode,
                  "Indirect addressing is not allowed when the execution type "
                  "is 64-bit");
      }

      /* The PRMs say that for CHV, BXT:
       *
       *    ARF registers must never be used with 64b datatype or when
       *    operation is integer DWord multiply.
       *
       * We assume that the restriction applies to GLK as well.
       *
       * We assume that the restriction does not apply to the null register.
       */
      if (is_double_precision &&
          (devinfo->platform == INTEL_PLATFORM_CHV ||
           intel_device_info_is_9lp(devinfo))) {
         /* MAC and AccWrEn imply an implicit accumulator (ARF) operand. */
         ERROR_IF(brw_inst_opcode(isa, inst) == BRW_OPCODE_MAC ||
                  brw_inst_acc_wr_control(devinfo, inst) ||
                  (BRW_ARCHITECTURE_REGISTER_FILE == file &&
                   reg != BRW_ARF_NULL) ||
                  (BRW_ARCHITECTURE_REGISTER_FILE == dst_file &&
                   dst_reg != BRW_ARF_NULL),
                  "Architecture registers cannot be used when the execution "
                  "type is 64-bit");
      }

      /* From the hardware spec section "Register Region Restrictions":
       *
       * There are two rules:
       *
       * "In case of all floating point data types used in destination:" and
       *
       * "In case where source or destination datatype is 64b or operation is
       *  integer DWord multiply:"
       *
       * both of which list the same restrictions:
       *
       *  "1. Register Regioning patterns where register data bit location
       *      of the LSB of the channels are changed between source and
       *      destination are not supported on Src0 and Src1 except for
       *      broadcast of a scalar.
       *
       *   2. Explicit ARF registers except null and accumulator must not be
       *      used."
       */
      if (devinfo->verx10 >= 125 &&
          (brw_reg_type_is_floating_point(dst_type) ||
           is_double_precision)) {
         ERROR_IF(!is_scalar_region &&
                  BRW_ADDRESS_REGISTER_INDIRECT_REGISTER != address_mode &&
                  (!is_linear(vstride, width, hstride) ||
                   src_stride != dst_stride ||
                   subreg != dst_subreg),
                  "Register Regioning patterns where register data bit "
                  "location of the LSB of the channels are changed between "
                  "source and destination are not supported except for "
                  "broadcast of a scalar.");

         ERROR_IF((address_mode == BRW_ADDRESS_DIRECT && file == BRW_ARCHITECTURE_REGISTER_FILE &&
                   reg != BRW_ARF_NULL && !(reg >= BRW_ARF_ACCUMULATOR && reg < BRW_ARF_FLAG)) ||
                  (dst_file == BRW_ARCHITECTURE_REGISTER_FILE &&
                   dst_reg != BRW_ARF_NULL && dst_reg != BRW_ARF_ACCUMULATOR),
                  "Explicit ARF registers except null and accumulator must not "
                  "be used.");
      }

      /* From the hardware spec section "Register Region Restrictions":
       *
       * "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float and
       *  Quad-Word data must not be used."
       */
      if (devinfo->verx10 >= 125 &&
          (brw_reg_type_is_floating_point(type) || type_sz(type) == 8)) {
         ERROR_IF(address_mode == BRW_ADDRESS_REGISTER_INDIRECT_REGISTER &&
                  vstride == BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL,
                  "Vx1 and VxH indirect addressing for Float, Half-Float, "
                  "Double-Float and Quad-Word data must not be used");
      }
   }

   /* The PRMs say that for BDW, SKL:
    *
    *    If Align16 is required for an operation with QW destination and non-QW
    *    source datatypes, the execution size cannot exceed 2.
    *
    * We assume that the restriction applies to all Gfx8+ parts.
    */
   if (is_double_precision && devinfo->ver >= 8) {
      enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
      enum brw_reg_type src1_type =
         num_sources > 1 ? brw_inst_src1_type(devinfo, inst) : src0_type;
      unsigned src0_type_size = brw_reg_type_to_size(src0_type);
      unsigned src1_type_size = brw_reg_type_to_size(src1_type);

      ERROR_IF(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16 &&
               dst_type_size == 8 &&
               (src0_type_size != 8 || src1_type_size != 8) &&
               brw_inst_exec_size(devinfo, inst) > BRW_EXECUTE_2,
               "In Align16 exec size cannot exceed 2 with a QWord destination "
               "and a non-QWord source");
   }

   /* The PRMs say that for CHV, BXT:
    *
    *    When source or destination datatype is 64b or operation is integer
    *    DWord multiply, DepCtrl must not be used.
    *
    * We assume that the restriction applies to GLK as well.
    */
   if (is_double_precision &&
       (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo))) {
      ERROR_IF(brw_inst_no_dd_check(devinfo, inst) ||
               brw_inst_no_dd_clear(devinfo, inst),
               "DepCtrl is not allowed when the execution type is 64-bit");
   }

   return error_msg;
}

static struct string
instruction_restrictions(const struct brw_isa_info *isa,
                         const brw_inst *inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   struct string error_msg = { .str = NULL, .len = 0 };

   /* From Wa_1604601757:
    *
    * "When multiplying a DW and any lower precision integer, source modifier
    *  is not supported."
+ */ + if (devinfo->ver >= 12 && + brw_inst_opcode(isa, inst) == BRW_OPCODE_MUL) { + enum brw_reg_type exec_type = execution_type(isa, inst); + const bool src0_valid = type_sz(brw_inst_src0_type(devinfo, inst)) == 4 || + brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE || + !(brw_inst_src0_negate(devinfo, inst) || + brw_inst_src0_abs(devinfo, inst)); + const bool src1_valid = type_sz(brw_inst_src1_type(devinfo, inst)) == 4 || + brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE || + !(brw_inst_src1_negate(devinfo, inst) || + brw_inst_src1_abs(devinfo, inst)); + + ERROR_IF(!brw_reg_type_is_floating_point(exec_type) && + type_sz(exec_type) == 4 && !(src0_valid && src1_valid), + "When multiplying a DW and any lower precision integer, source " + "modifier is not supported."); + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_CMP || + brw_inst_opcode(isa, inst) == BRW_OPCODE_CMPN) { + if (devinfo->ver <= 7) { + /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit + * ISA) says: + * + * Accumulator cannot be destination, implicit or explicit. The + * destination must be a general register or the null register. + * + * Page 77 of the Haswell PRM Volume 2b contains the same text. The + * 965G PRMs contain similar text. + * + * Page 864 (page 880 of the PDF) of the Broadwell PRM Volume 7 says: + * + * For the cmp and cmpn instructions, remove the accumulator + * restrictions. + */ + ERROR_IF(brw_inst_dst_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, inst) != BRW_ARF_NULL, + "Accumulator cannot be destination, implicit or explicit."); + } + + /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA) + * says: + * + * If the destination is the null register, the {Switch} instruction + * option must be used. + * + * Page 77 of the Haswell PRM Volume 2b contains the same text. 
+ */ + if (devinfo->ver == 7) { + ERROR_IF(dst_is_null(devinfo, inst) && + brw_inst_thread_control(devinfo, inst) != BRW_THREAD_SWITCH, + "If the destination is the null register, the {Switch} " + "instruction option must be used."); + } + + ERROR_IF(brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE, + "CMP (or CMPN) must have a condition."); + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_SEL) { + if (devinfo->ver < 6) { + ERROR_IF(brw_inst_cond_modifier(devinfo, inst) != BRW_CONDITIONAL_NONE, + "SEL must not have a condition modifier"); + ERROR_IF(brw_inst_pred_control(devinfo, inst) == BRW_PREDICATE_NONE, + "SEL must be predicated"); + } else { + ERROR_IF((brw_inst_cond_modifier(devinfo, inst) != BRW_CONDITIONAL_NONE) == + (brw_inst_pred_control(devinfo, inst) != BRW_PREDICATE_NONE), + "SEL must either be predicated or have a condition modifiers"); + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_MUL) { + const enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst); + const enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst); + const enum brw_reg_type dst_type = inst_dst_type(isa, inst); + + if (devinfo->ver == 6) { + /* Page 223 of the Sandybridge PRM volume 4 part 2 says: + * + * [DevSNB]: When multiple (sic) a DW and a W, the W has to be on + * src0, and the DW has to be on src1. + * + * This text appears only in the Sandybridge PRMw. + */ + ERROR_IF(brw_reg_type_is_integer(src0_type) && + type_sz(src0_type) == 4 && type_sz(src1_type) < 4, + "When multiplying a DW and any lower precision integer, the " + "DW operand must be src1."); + } else if (devinfo->ver >= 7) { + /* Page 966 (page 982 of the PDF) of Broadwell PRM volume 2a says: + * + * When multiplying a DW and any lower precision integer, the DW + * operand must on src0. + * + * Ivy Bridge, Haswell, Skylake, and Ice Lake PRMs contain the same + * text. 
+ */ + ERROR_IF(brw_reg_type_is_integer(src1_type) && + type_sz(src0_type) < 4 && type_sz(src1_type) == 4, + "When multiplying a DW and any lower precision integer, the " + "DW operand must be src0."); + } + + if (devinfo->ver <= 7) { + /* Section 14.2.28 of Intel 965 Express Chipset PRM volume 4 says: + * + * Source operands cannot be an accumulator register. + * + * Iron Lake, Sandybridge, and Ivy Bridge PRMs have the same text. + * Haswell does not. Given that later PRMs have different + * restrictions on accumulator sources (see below), it seems most + * likely that Haswell shares the Ivy Bridge restriction. + */ + ERROR_IF(src0_is_acc(devinfo, inst) || src1_is_acc(devinfo, inst), + "Source operands cannot be an accumulator register."); + } else { + /* Page 971 (page 987 of the PDF), section "Accumulator + * Restrictions," of the Broadwell PRM volume 7 says: + * + * Integer source operands cannot be accumulators. + * + * The Skylake and Ice Lake PRMs contain the same text. + */ + ERROR_IF((src0_is_acc(devinfo, inst) && + brw_reg_type_is_integer(src0_type)) || + (src1_is_acc(devinfo, inst) && + brw_reg_type_is_integer(src1_type)), + "Integer source operands cannot be accumulators."); + } + + if (devinfo->ver <= 6) { + /* Page 223 of the Sandybridge PRM volume 4 part 2 says: + * + * Dword integer source is not allowed for this instruction in + * float execution mode. In other words, if one source is of type + * float (:f, :vf), the other source cannot be of type dword + * integer (:ud or :d). + * + * G965 and Iron Lake PRMs have similar text. Later GPUs do not + * allow mixed source types at all, but that restriction should be + * handled elsewhere. 
+ */ + ERROR_IF(execution_type(isa, inst) == BRW_REGISTER_TYPE_F && + (src0_type == BRW_REGISTER_TYPE_UD || + src0_type == BRW_REGISTER_TYPE_D || + src1_type == BRW_REGISTER_TYPE_UD || + src1_type == BRW_REGISTER_TYPE_D), + "Dword integer source is not allowed for this instruction in" + "float execution mode."); + } + + if (devinfo->ver <= 7) { + /* Page 118 of the Haswell PRM volume 2b says: + * + * When operating on integers with at least one of the source + * being a DWord type (signed or unsigned), the destination cannot + * be floating-point (implementation note: the data converter only + * looks at the low 34 bits of the result). + * + * G965, Iron Lake, Sandybridge, and Ivy Bridge have similar text. + * Later GPUs do not allow mixed source and destination types at all, + * but that restriction should be handled elsewhere. + */ + ERROR_IF(dst_type == BRW_REGISTER_TYPE_F && + (src0_type == BRW_REGISTER_TYPE_UD || + src0_type == BRW_REGISTER_TYPE_D || + src1_type == BRW_REGISTER_TYPE_UD || + src1_type == BRW_REGISTER_TYPE_D), + "Float destination type not allowed with DWord source type."); + } + + if (devinfo->ver == 8) { + /* Page 966 (page 982 of the PDF) of the Broadwell PRM volume 2a + * says: + * + * When multiplying DW x DW, the dst cannot be accumulator. + * + * This text also appears in the Cherry Trail / Braswell PRM, but it + * does not appear in any other PRM. + */ + ERROR_IF((src0_type == BRW_REGISTER_TYPE_UD || + src0_type == BRW_REGISTER_TYPE_D) && + (src1_type == BRW_REGISTER_TYPE_UD || + src1_type == BRW_REGISTER_TYPE_D) && + brw_inst_dst_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_dst_da_reg_nr(devinfo, inst) != BRW_ARF_NULL, + "When multiplying DW x DW, the dst cannot be accumulator."); + } + + /* Page 935 (page 951 of the PDF) of the Ice Lake PRM volume 2a says: + * + * When multiplying integer data types, if one of the sources is a + * DW, the resulting full precision data is stored in the + * accumulator. 
However, if the destination data type is either W or + * DW, the low bits of the result are written to the destination + * register and the remaining high bits are discarded. This results + * in undefined Overflow and Sign flags. Therefore, conditional + * modifiers and saturation (.sat) cannot be used in this case. + * + * Similar text appears in every version of the PRM. + * + * The wording of the last sentence is not very clear. It could either + * be interpreted as "conditional modifiers combined with saturation + * cannot be used" or "neither conditional modifiers nor saturation can + * be used." I have interpreted it as the latter primarily because that + * is the more restrictive interpretation. + */ + ERROR_IF((src0_type == BRW_REGISTER_TYPE_UD || + src0_type == BRW_REGISTER_TYPE_D || + src1_type == BRW_REGISTER_TYPE_UD || + src1_type == BRW_REGISTER_TYPE_D) && + (dst_type == BRW_REGISTER_TYPE_UD || + dst_type == BRW_REGISTER_TYPE_D || + dst_type == BRW_REGISTER_TYPE_UW || + dst_type == BRW_REGISTER_TYPE_W) && + (brw_inst_saturate(devinfo, inst) != 0 || + brw_inst_cond_modifier(devinfo, inst) != BRW_CONDITIONAL_NONE), + "Neither Saturate nor conditional modifier allowed with DW " + "integer multiply."); + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_MATH) { + unsigned math_function = brw_inst_math_function(devinfo, inst); + switch (math_function) { + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: + case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: { + /* Page 442 of the Broadwell PRM Volume 2a "Extended Math Function" says: + * INT DIV function does not support source modifiers. + * Bspec 6647 extends it back to Ivy Bridge. 
+ */ + bool src0_valid = !brw_inst_src0_negate(devinfo, inst) && + !brw_inst_src0_abs(devinfo, inst); + bool src1_valid = !brw_inst_src1_negate(devinfo, inst) && + !brw_inst_src1_abs(devinfo, inst); + ERROR_IF(!src0_valid || !src1_valid, + "INT DIV function does not support source modifiers."); + break; + } + default: + break; + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_DP4A) { + /* Page 396 (page 412 of the PDF) of the DG1 PRM volume 2a says: + * + * Only one of src0 or src1 operand may be an the (sic) accumulator + * register (acc#). + */ + ERROR_IF(src0_is_acc(devinfo, inst) && src1_is_acc(devinfo, inst), + "Only one of src0 or src1 operand may be an accumulator " + "register (acc#)."); + + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_ADD3) { + const enum brw_reg_type dst_type = inst_dst_type(isa, inst); + + ERROR_IF(dst_type != BRW_REGISTER_TYPE_D && + dst_type != BRW_REGISTER_TYPE_UD && + dst_type != BRW_REGISTER_TYPE_W && + dst_type != BRW_REGISTER_TYPE_UW, + "Destination must be integer D, UD, W, or UW type."); + + for (unsigned i = 0; i < 3; i++) { + enum brw_reg_type src_type; + + switch (i) { + case 0: src_type = brw_inst_3src_a1_src0_type(devinfo, inst); break; + case 1: src_type = brw_inst_3src_a1_src1_type(devinfo, inst); break; + case 2: src_type = brw_inst_3src_a1_src2_type(devinfo, inst); break; + default: unreachable("invalid src"); + } + + ERROR_IF(src_type != BRW_REGISTER_TYPE_D && + src_type != BRW_REGISTER_TYPE_UD && + src_type != BRW_REGISTER_TYPE_W && + src_type != BRW_REGISTER_TYPE_UW, + "Source must be integer D, UD, W, or UW type."); + + if (i == 0) { + if (brw_inst_3src_a1_src0_is_imm(devinfo, inst)) { + ERROR_IF(src_type != BRW_REGISTER_TYPE_W && + src_type != BRW_REGISTER_TYPE_UW, + "Immediate source must be integer W or UW type."); + } + } else if (i == 2) { + if (brw_inst_3src_a1_src2_is_imm(devinfo, inst)) { + ERROR_IF(src_type != BRW_REGISTER_TYPE_W && + src_type != BRW_REGISTER_TYPE_UW, + "Immediate source must 
be integer W or UW type."); + } + } + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_OR || + brw_inst_opcode(isa, inst) == BRW_OPCODE_AND || + brw_inst_opcode(isa, inst) == BRW_OPCODE_XOR || + brw_inst_opcode(isa, inst) == BRW_OPCODE_NOT) { + if (devinfo->ver >= 8) { + /* While the behavior of the negate source modifier is defined as + * logical not, the behavior of abs source modifier is not + * defined. Disallow it to be safe. + */ + ERROR_IF(brw_inst_src0_abs(devinfo, inst), + "Behavior of abs source modifier in logic ops is undefined."); + ERROR_IF(brw_inst_opcode(isa, inst) != BRW_OPCODE_NOT && + brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE && + brw_inst_src1_abs(devinfo, inst), + "Behavior of abs source modifier in logic ops is undefined."); + + /* Page 479 (page 495 of the PDF) of the Broadwell PRM volume 2a says: + * + * Source modifier is not allowed if source is an accumulator. + * + * The same text also appears for OR, NOT, and XOR instructions. + */ + ERROR_IF((brw_inst_src0_abs(devinfo, inst) || + brw_inst_src0_negate(devinfo, inst)) && + src0_is_acc(devinfo, inst), + "Source modifier is not allowed if source is an accumulator."); + ERROR_IF(brw_num_sources_from_inst(isa, inst) > 1 && + (brw_inst_src1_abs(devinfo, inst) || + brw_inst_src1_negate(devinfo, inst)) && + src1_is_acc(devinfo, inst), + "Source modifier is not allowed if source is an accumulator."); + } + + /* Page 479 (page 495 of the PDF) of the Broadwell PRM volume 2a says: + * + * This operation does not produce sign or overflow conditions. Only + * the .e/.z or .ne/.nz conditional modifiers should be used. + * + * The same text also appears for OR, NOT, and XOR instructions. + * + * Per the comment around nir_op_imod in brw_fs_nir.cpp, we have + * determined this to not be true. The only conditions that seem + * absolutely sketchy are O, R, and U. Some OpenGL shaders from Doom + * 2016 have been observed to generate and.g and operate correctly. 
+ */ + const enum brw_conditional_mod cmod = + brw_inst_cond_modifier(devinfo, inst); + ERROR_IF(cmod == BRW_CONDITIONAL_O || + cmod == BRW_CONDITIONAL_R || + cmod == BRW_CONDITIONAL_U, + "O, R, and U conditional modifiers should not be used."); + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_BFI2) { + ERROR_IF(brw_inst_cond_modifier(devinfo, inst) != BRW_CONDITIONAL_NONE, + "BFI2 cannot have conditional modifier"); + + ERROR_IF(brw_inst_saturate(devinfo, inst), + "BFI2 cannot have saturate modifier"); + + enum brw_reg_type dst_type; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) + dst_type = brw_inst_3src_a1_dst_type(devinfo, inst); + else + dst_type = brw_inst_3src_a16_dst_type(devinfo, inst); + + ERROR_IF(dst_type != BRW_REGISTER_TYPE_D && + dst_type != BRW_REGISTER_TYPE_UD, + "BFI2 destination type must be D or UD"); + + for (unsigned s = 0; s < 3; s++) { + enum brw_reg_type src_type; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + switch (s) { + case 0: src_type = brw_inst_3src_a1_src0_type(devinfo, inst); break; + case 1: src_type = brw_inst_3src_a1_src1_type(devinfo, inst); break; + case 2: src_type = brw_inst_3src_a1_src2_type(devinfo, inst); break; + default: unreachable("invalid src"); + } + } else { + src_type = brw_inst_3src_a16_src_type(devinfo, inst); + } + + ERROR_IF(src_type != dst_type, + "BFI2 source type must match destination type"); + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_CSEL) { + ERROR_IF(brw_inst_pred_control(devinfo, inst) != BRW_PREDICATE_NONE, + "CSEL cannot be predicated"); + + /* CSEL is CMP and SEL fused into one. The condition modifier, which + * does not actually modify the flags, controls the built-in comparison. 
+ */ + ERROR_IF(brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE, + "CSEL must have a condition."); + + enum brw_reg_type dst_type; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) + dst_type = brw_inst_3src_a1_dst_type(devinfo, inst); + else + dst_type = brw_inst_3src_a16_dst_type(devinfo, inst); + + if (devinfo->ver < 8) { + ERROR_IF(devinfo->ver < 8, "CSEL not supported before Gfx8"); + } else if (devinfo->ver <= 9) { + ERROR_IF(dst_type != BRW_REGISTER_TYPE_F, + "CSEL destination type must be F"); + } else { + ERROR_IF(dst_type != BRW_REGISTER_TYPE_F && + dst_type != BRW_REGISTER_TYPE_HF && + dst_type != BRW_REGISTER_TYPE_D && + dst_type != BRW_REGISTER_TYPE_W, + "CSEL destination type must be F, HF, D, or W"); + } + + for (unsigned s = 0; s < 3; s++) { + enum brw_reg_type src_type; + + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + switch (s) { + case 0: src_type = brw_inst_3src_a1_src0_type(devinfo, inst); break; + case 1: src_type = brw_inst_3src_a1_src1_type(devinfo, inst); break; + case 2: src_type = brw_inst_3src_a1_src2_type(devinfo, inst); break; + default: unreachable("invalid src"); + } + } else { + src_type = brw_inst_3src_a16_src_type(devinfo, inst); + } + + ERROR_IF(src_type != dst_type, + "CSEL source type must match destination type"); + } + } + + if (brw_inst_opcode(isa, inst) == BRW_OPCODE_DPAS) { + ERROR_IF(brw_inst_dpas_3src_sdepth(devinfo, inst) != BRW_SYSTOLIC_DEPTH_8, + "Systolic depth must be 8."); + + const unsigned sdepth = 8; + + const enum brw_reg_type dst_type = + brw_inst_dpas_3src_dst_type(devinfo, inst); + const enum brw_reg_type src0_type = + brw_inst_dpas_3src_src0_type(devinfo, inst); + const enum brw_reg_type src1_type = + brw_inst_dpas_3src_src1_type(devinfo, inst); + const enum brw_reg_type src2_type = + brw_inst_dpas_3src_src2_type(devinfo, inst); + + const enum gfx12_sub_byte_precision src1_sub_byte = + brw_inst_dpas_3src_src1_subbyte(devinfo, inst); + + if (src1_type != 
BRW_REGISTER_TYPE_B && src1_type != BRW_REGISTER_TYPE_UB) { + ERROR_IF(src1_sub_byte != BRW_SUB_BYTE_PRECISION_NONE, + "Sub-byte precision must be None for source type larger than Byte."); + } else { + ERROR_IF(src1_sub_byte != BRW_SUB_BYTE_PRECISION_NONE && + src1_sub_byte != BRW_SUB_BYTE_PRECISION_4BIT && + src1_sub_byte != BRW_SUB_BYTE_PRECISION_2BIT, + "Invalid sub-byte precision."); + } + + const enum gfx12_sub_byte_precision src2_sub_byte = + brw_inst_dpas_3src_src2_subbyte(devinfo, inst); + + if (src2_type != BRW_REGISTER_TYPE_B && src2_type != BRW_REGISTER_TYPE_UB) { + ERROR_IF(src2_sub_byte != BRW_SUB_BYTE_PRECISION_NONE, + "Sub-byte precision must be None."); + } else { + ERROR_IF(src2_sub_byte != BRW_SUB_BYTE_PRECISION_NONE && + src2_sub_byte != BRW_SUB_BYTE_PRECISION_4BIT && + src2_sub_byte != BRW_SUB_BYTE_PRECISION_2BIT, + "Invalid sub-byte precision."); + } + + const unsigned src1_bits_per_element = + (8 * brw_reg_type_to_size(src1_type)) >> + brw_inst_dpas_3src_src1_subbyte(devinfo, inst); + + const unsigned src2_bits_per_element = + (8 * brw_reg_type_to_size(src2_type)) >> + brw_inst_dpas_3src_src2_subbyte(devinfo, inst); + + /* The MAX2(1, ...) is just to prevent possible division by 0 later. */ + const unsigned ops_per_chan = + MAX2(1, 32 / MAX2(src1_bits_per_element, src2_bits_per_element)); + + ERROR_IF(brw_inst_exec_size(devinfo, inst) != BRW_EXECUTE_8, + "DPAS execution size must be 8."); + + const unsigned exec_size = 8; + + const unsigned dst_subnr = brw_inst_dpas_3src_dst_subreg_nr(devinfo, inst); + const unsigned src0_subnr = brw_inst_dpas_3src_src0_subreg_nr(devinfo, inst); + const unsigned src1_subnr = brw_inst_dpas_3src_src1_subreg_nr(devinfo, inst); + const unsigned src2_subnr = brw_inst_dpas_3src_src2_subreg_nr(devinfo, inst); + + /* Until HF is supported as dst type, this is effectively subnr == 0. 
*/ + ERROR_IF(dst_subnr % exec_size != 0, + "Destination subregister offset must be a multiple of ExecSize."); + + /* Until HF is supported as src0 type, this is effectively subnr == 0. */ + ERROR_IF(src0_subnr % exec_size != 0, + "Src0 subregister offset must be a multiple of ExecSize."); + + ERROR_IF(src1_subnr != 0, + "Src1 subregister offsets must be 0."); + + /* In nearly all cases, this effectively requires that src2.subnr be + * 0. It is only when src1 is 8 bits and src2 is 2 or 4 bits that the + * ops_per_chan value can allow non-zero src2.subnr. + */ + ERROR_IF(src2_subnr % (sdepth * ops_per_chan) != 0, + "Src2 subregister offset must be a multiple of SystolicDepth " + "times OPS_PER_CHAN."); + + ERROR_IF(dst_subnr * type_sz(dst_type) >= REG_SIZE, + "Destination subregister specifies next register."); + + ERROR_IF(src0_subnr * type_sz(src0_type) >= REG_SIZE, + "Src0 subregister specifies next register."); + + ERROR_IF((src1_subnr * type_sz(src1_type) * src1_bits_per_element) / 8 >= REG_SIZE, + "Src1 subregister specifies next register."); + + ERROR_IF((src2_subnr * type_sz(src2_type) * src2_bits_per_element) / 8 >= REG_SIZE, + "Src2 subregister specifies next register."); + + if (brw_inst_3src_atomic_control(devinfo, inst)) { + /* FINISHME: When we start emitting DPAS with Atomic set, figure out + * a way to validate it. Also add a test in test_eu_validate.cpp. 
+ */ + ERROR_IF(true, + "When instruction option Atomic is used it must be follwed by a " + "DPAS instruction."); + } + + if (brw_inst_dpas_3src_exec_type(devinfo, inst) == + BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT) { + ERROR_IF(dst_type != BRW_REGISTER_TYPE_F, + "DPAS destination type must be F."); + ERROR_IF(src0_type != BRW_REGISTER_TYPE_F, + "DPAS src0 type must be F."); + ERROR_IF(src1_type != BRW_REGISTER_TYPE_HF, + "DPAS src1 type must be HF."); + ERROR_IF(src2_type != BRW_REGISTER_TYPE_HF, + "DPAS src2 type must be HF."); + } else { + ERROR_IF(dst_type != BRW_REGISTER_TYPE_D && + dst_type != BRW_REGISTER_TYPE_UD, + "DPAS destination type must be D or UD."); + ERROR_IF(src0_type != BRW_REGISTER_TYPE_D && + src0_type != BRW_REGISTER_TYPE_UD, + "DPAS src0 type must be D or UD."); + ERROR_IF(src1_type != BRW_REGISTER_TYPE_B && + src1_type != BRW_REGISTER_TYPE_UB, + "DPAS src1 base type must be B or UB."); + ERROR_IF(src2_type != BRW_REGISTER_TYPE_B && + src2_type != BRW_REGISTER_TYPE_UB, + "DPAS src2 base type must be B or UB."); + + if (brw_reg_type_is_unsigned_integer(dst_type)) { + ERROR_IF(!brw_reg_type_is_unsigned_integer(src0_type) || + !brw_reg_type_is_unsigned_integer(src1_type) || + !brw_reg_type_is_unsigned_integer(src2_type), + "If any source datatype is signed, destination datatype " + "must be signed."); + } + } + + /* FINISHME: Additional restrictions mentioned in the Bspec that are not + * yet enforced here: + * + * - General Accumulator registers access is not supported. This is + * currently enforced in brw_dpas_three_src (brw_eu_emit.c). + * + * - Given any combination of datatypes in the sources of a DPAS + * instructions, the boundaries of a register should not be crossed. 
+ */ + } + + return error_msg; +} + +static struct string +send_descriptor_restrictions(const struct brw_isa_info *isa, + const brw_inst *inst) +{ + const struct intel_device_info *devinfo = isa->devinfo; + struct string error_msg = { .str = NULL, .len = 0 }; + + if (inst_is_split_send(isa, inst)) { + /* We can only validate immediate descriptors */ + if (brw_inst_send_sel_reg32_desc(devinfo, inst)) + return error_msg; + } else if (inst_is_send(isa, inst)) { + /* We can only validate immediate descriptors */ + if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) + return error_msg; + } else { + return error_msg; + } + + const uint32_t desc = brw_inst_send_desc(devinfo, inst); + + switch (brw_inst_sfid(devinfo, inst)) { + case BRW_SFID_URB: + if (devinfo->ver < 20) + break; + FALLTHROUGH; + case GFX12_SFID_TGM: + case GFX12_SFID_SLM: + case GFX12_SFID_UGM: + ERROR_IF(!devinfo->has_lsc, "Platform does not support LSC"); + + ERROR_IF(lsc_opcode_has_transpose(lsc_msg_desc_opcode(devinfo, desc)) && + lsc_msg_desc_transpose(devinfo, desc) && + brw_inst_exec_size(devinfo, inst) != BRW_EXECUTE_1, + "Transposed vectors are restricted to Exec_Mask = 1."); + break; + + default: + break; + } + + if (brw_inst_sfid(devinfo, inst) == BRW_SFID_URB && devinfo->ver < 20) { + /* Gfx4 doesn't have a "header present" bit in the SEND message. */ + ERROR_IF(devinfo->ver > 4 && !brw_inst_header_present(devinfo, inst), + "Header must be present for all URB messages."); + + switch (brw_inst_urb_opcode(devinfo, inst)) { + case BRW_URB_OPCODE_WRITE_HWORD: + break; + + /* case FF_SYNC: */ + case BRW_URB_OPCODE_WRITE_OWORD: + /* Gfx5 / Gfx6 FF_SYNC message and Gfx7+ URB_WRITE_OWORD have the + * same opcode value. 
+ */ + if (devinfo->ver == 5 || devinfo->ver == 6) { + ERROR_IF(brw_inst_urb_global_offset(devinfo, inst) != 0, + "FF_SYNC global offset must be zero."); + ERROR_IF(brw_inst_urb_swizzle_control(devinfo, inst) != 0, + "FF_SYNC swizzle control must be zero."); + ERROR_IF(brw_inst_urb_used(devinfo, inst) != 0, + "FF_SYNC used must be zero."); + ERROR_IF(brw_inst_urb_complete(devinfo, inst) != 0, + "FF_SYNC complete must be zero."); + + /* Volume 4 part 2 of the Sandybridge PRM (page 28) says: + * + * A message response (writeback) length of 1 GRF will be + * indicated on the ‘send’ instruction if the thread requires + * response data and/or synchronization. + */ + ERROR_IF((unsigned)brw_inst_rlen(devinfo, inst) > 1, + "FF_SYNC read length must be 0 or 1."); + } else { + ERROR_IF(devinfo->ver < 7, + "URB OWORD write messages only valid on gfx >= 7"); + } + break; + + case BRW_URB_OPCODE_READ_HWORD: + case BRW_URB_OPCODE_READ_OWORD: + ERROR_IF(devinfo->ver < 7, + "URB read messages only valid on gfx >= 7"); + break; + + case GFX7_URB_OPCODE_ATOMIC_MOV: + case GFX7_URB_OPCODE_ATOMIC_INC: + ERROR_IF(devinfo->ver < 7, + "URB atomic move and increment messages only valid on gfx >= 7"); + break; + + case GFX8_URB_OPCODE_ATOMIC_ADD: + /* The Haswell PRM lists this opcode as valid on page 317. 
*/ + ERROR_IF(devinfo->verx10 < 75, + "URB atomic add message only valid on gfx >= 7.5"); + break; + + case GFX8_URB_OPCODE_SIMD8_READ: + ERROR_IF(brw_inst_rlen(devinfo, inst) == 0, + "URB SIMD8 read message must read some data."); + FALLTHROUGH; + + case GFX8_URB_OPCODE_SIMD8_WRITE: + ERROR_IF(devinfo->ver < 8, + "URB SIMD8 messages only valid on gfx >= 8"); + break; + + case GFX125_URB_OPCODE_FENCE: + ERROR_IF(devinfo->verx10 < 125, + "URB fence message only valid on gfx >= 12.5"); + break; + + default: + ERROR_IF(true, "Invalid URB message"); + break; + } + } + + return error_msg; +} + +bool +brw_validate_instruction(const struct brw_isa_info *isa, + const brw_inst *inst, int offset, + unsigned inst_size, + struct disasm_info *disasm) +{ + struct string error_msg = { .str = NULL, .len = 0 }; + + if (is_unsupported_inst(isa, inst)) { + ERROR("Instruction not supported on this Gen"); + } else { + CHECK(invalid_values); + + if (error_msg.str == NULL) { + CHECK(sources_not_null); + CHECK(send_restrictions); + CHECK(alignment_supported); + CHECK(general_restrictions_based_on_operand_types); + CHECK(general_restrictions_on_region_parameters); + CHECK(special_restrictions_for_mixed_float_mode); + CHECK(region_alignment_rules); + CHECK(vector_immediate_restrictions); + CHECK(special_requirements_for_handling_double_precision_data_types); + CHECK(instruction_restrictions); + CHECK(send_descriptor_restrictions); + } + } + + if (error_msg.str && disasm) { + disasm_insert_error(disasm, offset, inst_size, error_msg.str); + } + free(error_msg.str); + + return error_msg.len == 0; +} + +bool +brw_validate_instructions(const struct brw_isa_info *isa, + const void *assembly, int start_offset, int end_offset, + struct disasm_info *disasm) +{ + const struct intel_device_info *devinfo = isa->devinfo; + bool valid = true; + + for (int src_offset = start_offset; src_offset < end_offset;) { + const brw_inst *inst = assembly + src_offset; + bool is_compact = 
brw_inst_cmpt_control(devinfo, inst); + unsigned inst_size = is_compact ? sizeof(brw_compact_inst) + : sizeof(brw_inst); + brw_inst uncompacted; + + if (is_compact) { + brw_compact_inst *compacted = (void *)inst; + brw_uncompact_instruction(isa, &uncompacted, compacted); + inst = &uncompacted; + } + + bool v = brw_validate_instruction(isa, inst, src_offset, + inst_size, disasm); + valid = valid && v; + + src_offset += inst_size; + } + + return valid; +} diff --git a/src/intel/compiler/elk/brw_fs.cpp b/src/intel/compiler/elk/brw_fs.cpp new file mode 100644 index 00000000000..2a9cee96c5e --- /dev/null +++ b/src/intel/compiler/elk/brw_fs.cpp @@ -0,0 +1,8561 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/** @file brw_fs.cpp + * + * This file drives the GLSL IR -> LIR translation, contains the + * optimizations on the LIR, and drives the generation of native code + * from the LIR. + */ + +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_fs_live_variables.h" +#include "brw_nir.h" +#include "brw_vec4_gs_visitor.h" +#include "brw_cfg.h" +#include "brw_dead_control_flow.h" +#include "brw_private.h" +#include "intel_nir.h" +#include "shader_enums.h" +#include "dev/intel_debug.h" +#include "dev/intel_wa.h" +#include "compiler/glsl_types.h" +#include "compiler/nir/nir_builder.h" +#include "util/u_math.h" + +#include + +using namespace brw; + +static unsigned get_lowered_simd_width(const fs_visitor *shader, + const fs_inst *inst); + +void +fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg *src, unsigned sources) +{ + memset((void*)this, 0, sizeof(*this)); + + this->src = new fs_reg[MAX2(sources, 3)]; + for (unsigned i = 0; i < sources; i++) + this->src[i] = src[i]; + + this->opcode = opcode; + this->dst = dst; + this->sources = sources; + this->exec_size = exec_size; + this->base_mrf = -1; + + assert(dst.file != IMM && dst.file != UNIFORM); + + assert(this->exec_size != 0); + + this->conditional_mod = BRW_CONDITIONAL_NONE; + + /* This will be the case for almost all instructions. 
*/ + switch (dst.file) { + case VGRF: + case ARF: + case FIXED_GRF: + case MRF: + case ATTR: + this->size_written = dst.component_size(exec_size); + break; + case BAD_FILE: + this->size_written = 0; + break; + case IMM: + case UNIFORM: + unreachable("Invalid destination register file"); + } + + this->writes_accumulator = false; +} + +fs_inst::fs_inst() +{ + init(BRW_OPCODE_NOP, 8, dst, NULL, 0); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size) +{ + init(opcode, exec_size, reg_undef, NULL, 0); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst) +{ + init(opcode, exec_size, dst, NULL, 0); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0) +{ + const fs_reg src[1] = { src0 }; + init(opcode, exec_size, dst, src, 1); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1) +{ + const fs_reg src[2] = { src0, src1 }; + init(opcode, exec_size, dst, src, 2); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1, const fs_reg &src2) +{ + const fs_reg src[3] = { src0, src1, src2 }; + init(opcode, exec_size, dst, src, 3); +} + +fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, + const fs_reg src[], unsigned sources) +{ + init(opcode, exec_width, dst, src, sources); +} + +fs_inst::fs_inst(const fs_inst &that) +{ + memcpy((void*)this, &that, sizeof(that)); + + this->src = new fs_reg[MAX2(that.sources, 3)]; + + for (unsigned i = 0; i < that.sources; i++) + this->src[i] = that.src[i]; +} + +fs_inst::~fs_inst() +{ + delete[] this->src; +} + +void +fs_inst::resize_sources(uint8_t num_sources) +{ + if (this->sources != num_sources) { + fs_reg *src = new fs_reg[MAX2(num_sources, 3)]; + + for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i) + src[i] = this->src[i]; + + delete[] this->src; + this->src = src; + this->sources = 
num_sources; + } +} + +void +fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, + const fs_reg &dst, + const fs_reg &surface, + const fs_reg &surface_handle, + const fs_reg &varying_offset, + uint32_t const_offset, + uint8_t alignment, + unsigned components) +{ + assert(components <= 4); + + /* We have our constant surface use a pitch of 4 bytes, so our index can + * be any component of a vector, and then we load 4 contiguous + * components starting from that. TODO: Support loading fewer than 4. + */ + fs_reg total_offset = vgrf(glsl_uint_type()); + bld.ADD(total_offset, varying_offset, brw_imm_ud(const_offset)); + + /* The pull load message will load a vec4 (16 bytes). If we are loading + * a double this means we are only loading 2 elements worth of data. + * We also want to use a 32-bit data type for the dst of the load operation + * so other parts of the driver don't get confused about the size of the + * result. + */ + fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4); + + fs_reg srcs[PULL_VARYING_CONSTANT_SRCS]; + srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface; + srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle; + srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = total_offset; + srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = brw_imm_ud(alignment); + + fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, + vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS); + inst->size_written = 4 * vec4_result.component_size(inst->exec_size); + + shuffle_from_32bit_read(bld, dst, vec4_result, 0, components); +} + +/** + * A helper for MOV generation for fixing up broken hardware SEND dependency + * handling. + */ +void +fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf) +{ + /* The caller always wants uncompressed to emit the minimal extra + * dependencies, and to avoid having to deal with aligning its regs to 2. 
+ */ + const fs_builder ubld = bld.annotate("send dependency resolve") + .quarter(0); + + ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F)); +} + +bool +fs_inst::is_send_from_grf() const +{ + switch (opcode) { + case SHADER_OPCODE_SEND: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + case SHADER_OPCODE_INTERLOCK: + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_BARRIER: + return true; + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + return src[1].file == VGRF; + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + return src[0].file == VGRF; + default: + return false; + } +} + +bool +fs_inst::is_control_source(unsigned arg) const +{ + switch (opcode) { + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: + return arg == 0; + + case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_SHUFFLE: + case SHADER_OPCODE_QUAD_SWIZZLE: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + return arg == 1; + + case SHADER_OPCODE_MOV_INDIRECT: + case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + return arg == 1 || arg == 2; + + case SHADER_OPCODE_SEND: + return arg == 0 || arg == 1; + + default: + return false; + } +} + +bool +fs_inst::is_payload(unsigned arg) const +{ + switch (opcode) { + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + case VEC4_OPCODE_UNTYPED_ATOMIC: + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + 
case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case SHADER_OPCODE_INTERLOCK: + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_BARRIER: + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + return arg == 0; + + case SHADER_OPCODE_SEND: + return arg == 2 || arg == 3; + + default: + return false; + } +} + +/** + * Returns true if this instruction's sources and destinations cannot + * safely be the same register. + * + * In most cases, a register can be written over safely by the same + * instruction that is its last use. For a single instruction, the + * sources are dereferenced before writing of the destination starts + * (naturally). + * + * However, there are a few cases where this can be problematic: + * + * - Virtual opcodes that translate to multiple instructions in the + * code generator: if src == dst and one instruction writes the + * destination before a later instruction reads the source, then + * src will have been clobbered. + * + * - SIMD16 compressed instructions with certain regioning (see below). + * + * The register allocator uses this information to set up conflicts between + * GRF sources and the destination. 
+ */ +bool +fs_inst::has_source_and_destination_hazard() const +{ + switch (opcode) { + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + /* Multiple partial writes to the destination */ + return true; + case SHADER_OPCODE_SHUFFLE: + /* This instruction returns an arbitrary channel from the source and + * gets split into smaller instructions in the generator. It's possible + * that one of the instructions will read from a channel corresponding + * to an earlier instruction. + */ + case SHADER_OPCODE_SEL_EXEC: + /* This is implemented as + * + * mov(16) g4<1>D 0D { align1 WE_all 1H }; + * mov(16) g4<1>D g5<8,8,1>D { align1 1H } + * + * Because the source is only read in the second instruction, the first + * may stomp all over it. + */ + return true; + case SHADER_OPCODE_QUAD_SWIZZLE: + switch (src[1].ud) { + case BRW_SWIZZLE_XXXX: + case BRW_SWIZZLE_YYYY: + case BRW_SWIZZLE_ZZZZ: + case BRW_SWIZZLE_WWWW: + case BRW_SWIZZLE_XXZZ: + case BRW_SWIZZLE_YYWW: + case BRW_SWIZZLE_XYXY: + case BRW_SWIZZLE_ZWZW: + /* These can be implemented as a single Align1 region on all + * platforms, so there's never a hazard between source and + * destination. C.f. fs_generator::generate_quad_swizzle(). + */ + return false; + default: + return !is_uniform(src[0]); + } + case BRW_OPCODE_DPAS: + /* This is overly conservative. The actual hazard is more complicated to + * describe. When the repeat count is N, the single instruction behaves + * like N instructions with a repeat count of one, but the destination + * and source registers are incremented (in somewhat complex ways) for + * each instruction. + * + * This means the source and destination register is actually a range of + * registers. The hazard exists of an earlier iteration would write a + * register that should be read by a later iteration. + * + * There may be some advantage to properly modeling this, but for now, + * be overly conservative. 
+ */ + return rcount > 1; + default: + /* The SIMD16 compressed instruction + * + * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F + * + * is actually decoded in hardware as: + * + * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F + * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F + * + * Which is safe. However, if we have uniform accesses + * happening, we get into trouble: + * + * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F + * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F + * + * Now our destination for the first instruction overwrote the + * second instruction's src0, and we get garbage for those 8 + * pixels. There's a similar issue for the pre-gfx6 + * pixel_x/pixel_y, which are registers of 16-bit values and thus + * would get stomped by the first decode as well. + */ + if (exec_size == 16) { + for (int i = 0; i < sources; i++) { + if (src[i].file == VGRF && (src[i].stride == 0 || + src[i].type == BRW_REGISTER_TYPE_UW || + src[i].type == BRW_REGISTER_TYPE_W || + src[i].type == BRW_REGISTER_TYPE_UB || + src[i].type == BRW_REGISTER_TYPE_B)) { + return true; + } + } + } + return false; + } +} + +bool +fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const +{ + if (devinfo->ver == 6 && is_math()) + return false; + + if (is_send_from_grf()) + return false; + + /* From Wa_1604601757: + * + * "When multiplying a DW and any lower precision integer, source modifier + * is not supported." + */ + if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL || + opcode == BRW_OPCODE_MAD)) { + const brw_reg_type exec_type = get_exec_type(this); + const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ? 
+ MIN2(type_sz(src[1].type), type_sz(src[2].type)) : + MIN2(type_sz(src[0].type), type_sz(src[1].type)); + + if (brw_reg_type_is_integer(exec_type) && + type_sz(exec_type) >= 4 && + type_sz(exec_type) != min_type_sz) + return false; + } + + if (!backend_instruction::can_do_source_mods()) + return false; + + return true; +} + +bool +fs_inst::can_do_cmod() +{ + if (!backend_instruction::can_do_cmod()) + return false; + + /* The accumulator result appears to get used for the conditional modifier + * generation. When negating a UD value, there is a 33rd bit generated for + * the sign in the accumulator value, so now you can't check, for example, + * equality with a 32-bit value. See piglit fs-op-neg-uvec4. + */ + for (unsigned i = 0; i < sources; i++) { + if (brw_reg_type_is_unsigned_integer(src[i].type) && src[i].negate) + return false; + } + + return true; +} + +bool +fs_inst::can_change_types() const +{ + return dst.type == src[0].type && + !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR && + (opcode == BRW_OPCODE_MOV || + (opcode == BRW_OPCODE_SEL && + dst.type == src[1].type && + predicate != BRW_PREDICATE_NONE && + !src[1].abs && !src[1].negate && src[1].file != ATTR)); +} + +void +fs_reg::init() +{ + memset((void*)this, 0, sizeof(*this)); + type = BRW_REGISTER_TYPE_UD; + stride = 1; +} + +/** Generic unset register constructor. 
*/ +fs_reg::fs_reg() +{ + init(); + this->file = BAD_FILE; +} + +fs_reg::fs_reg(struct ::brw_reg reg) : + backend_reg(reg) +{ + this->offset = 0; + this->stride = 1; + if (this->file == IMM && + (this->type != BRW_REGISTER_TYPE_V && + this->type != BRW_REGISTER_TYPE_UV && + this->type != BRW_REGISTER_TYPE_VF)) { + this->stride = 0; + } +} + +bool +fs_reg::equals(const fs_reg &r) const +{ + return (this->backend_reg::equals(r) && + stride == r.stride); +} + +bool +fs_reg::negative_equals(const fs_reg &r) const +{ + return (this->backend_reg::negative_equals(r) && + stride == r.stride); +} + +bool +fs_reg::is_contiguous() const +{ + switch (file) { + case ARF: + case FIXED_GRF: + return hstride == BRW_HORIZONTAL_STRIDE_1 && + vstride == width + hstride; + case MRF: + case VGRF: + case ATTR: + return stride == 1; + case UNIFORM: + case IMM: + case BAD_FILE: + return true; + } + + unreachable("Invalid register file"); +} + +unsigned +fs_reg::component_size(unsigned width) const +{ + if (file == ARF || file == FIXED_GRF) { + const unsigned w = MIN2(width, 1u << this->width); + const unsigned h = width >> this->width; + const unsigned vs = vstride ? 1 << (vstride - 1) : 0; + const unsigned hs = hstride ? 1 << (hstride - 1) : 0; + assert(w > 0); + return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type); + } else { + return MAX2(width * stride, 1) * type_sz(type); + } +} + +void +fs_visitor::vfail(const char *format, va_list va) +{ + char *msg; + + if (failed) + return; + + failed = true; + + msg = ralloc_vasprintf(mem_ctx, format, va); + msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n", + dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg); + + this->fail_msg = msg; + + if (unlikely(debug_enabled)) { + fprintf(stderr, "%s", msg); + } +} + +void +fs_visitor::fail(const char *format, ...) 
+{ + va_list va; + + va_start(va, format); + vfail(format, va); + va_end(va); +} + +/** + * Mark this program as impossible to compile with dispatch width greater + * than n. + * + * During the SIMD8 compile (which happens first), we can detect and flag + * things that are unsupported in SIMD16+ mode, so the compiler can skip the + * SIMD16+ compile altogether. + * + * During a compile of dispatch width greater than n (if one happens anyway), + * this just calls fail(). + */ +void +fs_visitor::limit_dispatch_width(unsigned n, const char *msg) +{ + if (dispatch_width > n) { + fail("%s", msg); + } else { + max_dispatch_width = MIN2(max_dispatch_width, n); + brw_shader_perf_log(compiler, log_data, + "Shader dispatch width limited to SIMD%d: %s\n", + n, msg); + } +} + +/** + * Returns true if the instruction has a flag that means it won't + * update an entire destination register. + * + * For example, dead code elimination and live variable analysis want to know + * when a write to a variable screens off any preceding values that were in + * it. + */ +bool +fs_inst::is_partial_write() const +{ + if (this->predicate && !this->predicate_trivial && + this->opcode != BRW_OPCODE_SEL) + return true; + + if (this->dst.offset % REG_SIZE != 0) + return true; + + /* SEND instructions always write whole registers */ + if (this->opcode == SHADER_OPCODE_SEND) + return false; + + /* Special case UNDEF since a lot of places in the backend do things like this : + * + * fs_builder ubld = bld.exec_all().group(1, 0); + * fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); + * ubld.UNDEF(tmp); <- partial write, even if the whole register is concerned + */ + if (this->opcode == SHADER_OPCODE_UNDEF) { + assert(this->dst.is_contiguous()); + return this->size_written < 32; + } + + return this->exec_size * type_sz(this->dst.type) < 32 || + !this->dst.is_contiguous(); +} + +unsigned +fs_inst::components_read(unsigned i) const +{ + /* Return zero if the source is not present. 
*/ + if (src[i].file == BAD_FILE) + return 0; + + switch (opcode) { + case FS_OPCODE_LINTERP: + if (i == 0) + return 2; + else + return 1; + + case FS_OPCODE_PIXEL_X: + case FS_OPCODE_PIXEL_Y: + assert(i < 2); + if (i == 0) + return 2; + else + return 1; + + case FS_OPCODE_FB_WRITE_LOGICAL: + assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); + /* First/second FB write color. */ + if (i < 2) + return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; + else + return 1; + + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM && + src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM && + src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM); + /* Texture coordinates. */ + if (i == TEX_LOGICAL_SRC_COORDINATE) + return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; + /* Texture derivatives. */ + else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) && + opcode == SHADER_OPCODE_TXD_LOGICAL) + return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; + /* Texture offset. 
*/ + else if (i == TEX_LOGICAL_SRC_TG4_OFFSET) + return 2; + /* MCS */ + else if (i == TEX_LOGICAL_SRC_MCS) { + if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) + return 2; + else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) + return 4; + else + return 1; + } else + return 1; + + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM); + /* Surface coordinates. */ + if (i == SURFACE_LOGICAL_SRC_ADDRESS) + return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; + /* Surface operation source (ignored for reads). */ + else if (i == SURFACE_LOGICAL_SRC_DATA) + return 0; + else + return 1; + + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && + src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + /* Surface coordinates. */ + if (i == SURFACE_LOGICAL_SRC_ADDRESS) + return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; + /* Surface operation source. 
*/ + else if (i == SURFACE_LOGICAL_SRC_DATA) + return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud; + else + return 1; + + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + assert(src[A64_LOGICAL_ARG].file == IMM); + return 1; + + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: + assert(src[A64_LOGICAL_ARG].file == IMM); + if (i == A64_LOGICAL_SRC) { /* data to write */ + const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size; + assert(comps > 0); + return comps; + } else { + return 1; + } + + case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + return 1; + + case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + if (i == SURFACE_LOGICAL_SRC_DATA) { + const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size; + assert(comps > 0); + return comps; + } else { + return 1; + } + + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + assert(src[A64_LOGICAL_ARG].file == IMM); + return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1; + + case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: + assert(src[A64_LOGICAL_ARG].file == IMM); + return i == A64_LOGICAL_SRC ? + lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1; + + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: + /* Scattered logical opcodes use the following params: + * src[0] Surface coordinates + * src[1] Surface operation source (ignored for reads) + * src[2] Surface + * src[3] IMM with always 1 dimension. + * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32 + */ + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && + src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + return i == SURFACE_LOGICAL_SRC_DATA ? 
0 : 1; + + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && + src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + return 1; + + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: { + assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && + src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud; + /* Surface coordinates. */ + if (i == SURFACE_LOGICAL_SRC_ADDRESS) + return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; + /* Surface operation source. */ + else if (i == SURFACE_LOGICAL_SRC_DATA) + return lsc_op_num_data_values(op); + else + return 1; + } + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + return (i == 0 ? 2 : 1); + + case SHADER_OPCODE_URB_WRITE_LOGICAL: + assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM); + + if (i == URB_LOGICAL_SRC_DATA) + return src[URB_LOGICAL_SRC_COMPONENTS].ud; + else + return 1; + + case BRW_OPCODE_DPAS: + unreachable("Do not use components_read() for DPAS."); + + default: + return 1; + } +} + +unsigned +fs_inst::size_read(int arg) const +{ + switch (opcode) { + case SHADER_OPCODE_SEND: + if (arg == 2) { + return mlen * REG_SIZE; + } else if (arg == 3) { + return ex_mlen * REG_SIZE; + } + break; + + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_REP_FB_WRITE: + if (arg == 0) { + if (base_mrf >= 0) + return src[0].file == BAD_FILE ? 
0 : 2 * REG_SIZE; + else + return mlen * REG_SIZE; + } + break; + + case FS_OPCODE_FB_READ: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + if (arg == 0) + return mlen * REG_SIZE; + break; + + case FS_OPCODE_SET_SAMPLE_ID: + if (arg == 1) + return 1; + break; + + case FS_OPCODE_LINTERP: + if (arg == 1) + return 16; + break; + + case SHADER_OPCODE_LOAD_PAYLOAD: + if (arg < this->header_size) + return retype(src[arg], BRW_REGISTER_TYPE_UD).component_size(8); + break; + + case CS_OPCODE_CS_TERMINATE: + case SHADER_OPCODE_BARRIER: + return REG_SIZE; + + case SHADER_OPCODE_MOV_INDIRECT: + if (arg == 0) { + assert(src[2].file == IMM); + return src[2].ud; + } + break; + + case BRW_OPCODE_DPAS: + switch (arg) { + case 0: + if (src[0].type == BRW_REGISTER_TYPE_HF) { + return rcount * REG_SIZE / 2; + } else { + return rcount * REG_SIZE; + } + case 1: + return sdepth * REG_SIZE; + case 2: + /* This is simpler than the formula described in the Bspec, but it + * covers all of the cases that we support on DG2. 
+ */ + return rcount * REG_SIZE; + default: + unreachable("Invalid source number."); + } + break; + + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + if (arg == 0 && src[0].file == VGRF) + return mlen * REG_SIZE; + break; + + default: + break; + } + + switch (src[arg].file) { + case UNIFORM: + case IMM: + return components_read(arg) * type_sz(src[arg].type); + case BAD_FILE: + case ARF: + case FIXED_GRF: + case VGRF: + case ATTR: + return components_read(arg) * src[arg].component_size(exec_size); + case MRF: + unreachable("MRF registers are not allowed as sources"); + } + return 0; +} + +namespace { + unsigned + predicate_width(const intel_device_info *devinfo, brw_predicate predicate) + { + if (devinfo->ver >= 20) { + return 1; + } else { + switch (predicate) { + case BRW_PREDICATE_NONE: return 1; + case BRW_PREDICATE_NORMAL: return 1; + case BRW_PREDICATE_ALIGN1_ANY2H: return 2; + case BRW_PREDICATE_ALIGN1_ALL2H: return 2; + case BRW_PREDICATE_ALIGN1_ANY4H: return 4; + case BRW_PREDICATE_ALIGN1_ALL4H: return 4; + case BRW_PREDICATE_ALIGN1_ANY8H: return 8; + case BRW_PREDICATE_ALIGN1_ALL8H: return 8; + case BRW_PREDICATE_ALIGN1_ANY16H: return 16; + case BRW_PREDICATE_ALIGN1_ALL16H: return 16; + case BRW_PREDICATE_ALIGN1_ANY32H: return 32; + case BRW_PREDICATE_ALIGN1_ALL32H: return 32; + default: unreachable("Unsupported predicate"); + } + } + } + + /* Return the subset of flag registers that an instruction could + * potentially read or write based on the execution controls and flag + * subregister number of the instruction. 
+ */ + unsigned + flag_mask(const fs_inst *inst, unsigned width) + { + assert(util_is_power_of_two_nonzero(width)); + const unsigned start = (inst->flag_subreg * 16 + inst->group) & + ~(width - 1); + const unsigned end = start + ALIGN(inst->exec_size, width); + return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1); + } + + unsigned + bit_mask(unsigned n) + { + return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1); + } + + unsigned + flag_mask(const fs_reg &r, unsigned sz) + { + if (r.file == ARF) { + const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr; + const unsigned end = start + sz; + return bit_mask(end) & ~bit_mask(start); + } else { + return 0; + } + } +} + +unsigned +fs_inst::flags_read(const intel_device_info *devinfo) const +{ + if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV || + predicate == BRW_PREDICATE_ALIGN1_ALLV)) { + /* The vertical predication modes combine corresponding bits from + * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware. + */ + const unsigned shift = devinfo->ver >= 7 ? 4 : 2; + return flag_mask(this, 1) << shift | flag_mask(this, 1); + } else if (predicate) { + return flag_mask(this, predicate_width(devinfo, predicate)); + } else { + unsigned mask = 0; + for (int i = 0; i < sources; i++) { + mask |= flag_mask(src[i], size_read(i)); + } + return mask; + } +} + +unsigned +fs_inst::flags_written(const intel_device_info *devinfo) const +{ + /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented + * using a separate cmpn and sel instruction. This lowering occurs in + * fs_vistor::lower_minmax which is called very, very late. 
+ */ + if ((conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) && + opcode != BRW_OPCODE_CSEL && + opcode != BRW_OPCODE_IF && + opcode != BRW_OPCODE_WHILE)) || + opcode == FS_OPCODE_FB_WRITE) { + return flag_mask(this, 1); + } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL || + opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL || + opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) { + return flag_mask(this, 32); + } else { + return flag_mask(dst, size_written); + } +} + +/** + * Returns how many MRFs an FS opcode will write over. + * + * Note that this is not the 0 or 1 implied writes in an actual gen + * instruction -- the FS opcodes often generate MOVs in addition. + */ +unsigned +fs_inst::implied_mrf_writes() const +{ + if (mlen == 0) + return 0; + + if (base_mrf == -1) + return 0; + + switch (opcode) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return 1 * exec_size / 8; + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return 2 * exec_size / 8; + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_SAMPLEINFO: + return 1; + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_REP_FB_WRITE: + return src[0].file == BAD_FILE ? 
0 : 2; + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case SHADER_OPCODE_GFX4_SCRATCH_READ: + return 1; + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: + return mlen; + case SHADER_OPCODE_GFX4_SCRATCH_WRITE: + return mlen; + default: + unreachable("not reached"); + } +} + +bool +fs_inst::has_sampler_residency() const +{ + switch (opcode) { + case SHADER_OPCODE_TEX_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM); + return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0; + default: + return false; + } +} + +fs_reg +fs_visitor::vgrf(const glsl_type *const type) +{ + int reg_width = dispatch_width / 8; + return fs_reg(VGRF, + alloc.allocate(glsl_count_dword_slots(type, false) * reg_width), + brw_type_for_base_type(type)); +} + +fs_reg::fs_reg(enum brw_reg_file file, unsigned nr) +{ + init(); + this->file = file; + this->nr = nr; + this->type = BRW_REGISTER_TYPE_F; + this->stride = (file == UNIFORM ? 0 : 1); +} + +fs_reg::fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type) +{ + init(); + this->file = file; + this->nr = nr; + this->type = type; + this->stride = (file == UNIFORM ? 0 : 1); +} + +/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. + * This brings in those uniform definitions + */ +void +fs_visitor::import_uniforms(fs_visitor *v) +{ + this->push_constant_loc = v->push_constant_loc; + this->uniforms = v->uniforms; +} + +enum brw_barycentric_mode +brw_barycentric_mode(nir_intrinsic_instr *intr) +{ + const glsl_interp_mode mode = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr); + + /* Barycentric modes don't make sense for flat inputs. 
*/ + assert(mode != INTERP_MODE_FLAT); + + unsigned bary; + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_at_offset: + bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL; + break; + case nir_intrinsic_load_barycentric_centroid: + bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID; + break; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE; + break; + default: + unreachable("invalid intrinsic"); + } + + if (mode == INTERP_MODE_NOPERSPECTIVE) + bary += 3; + + return (enum brw_barycentric_mode) bary; +} + +/** + * Turn one of the two CENTROID barycentric modes into PIXEL mode. + */ +static enum brw_barycentric_mode +centroid_to_pixel(enum brw_barycentric_mode bary) +{ + assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID || + bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); + return (enum brw_barycentric_mode) ((unsigned) bary - 1); +} + +/** + * Walk backwards from the end of the program looking for a URB write that + * isn't in control flow, and mark it with EOT. + * + * Return true if successful or false if a separate EOT write is needed. + */ +bool +fs_visitor::mark_last_urb_write_with_eot() +{ + foreach_in_list_reverse(fs_inst, prev, &this->instructions) { + if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) { + prev->eot = true; + + /* Delete now dead instructions. 
*/ + foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) { + if (dead == prev) + break; + dead->remove(); + } + return true; + } else if (prev->is_control_flow() || prev->has_side_effects()) { + break; + } + } + + return false; +} + +void +fs_visitor::emit_gs_thread_end() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + + if (gs_compile->control_data_header_size_bits > 0) { + emit_gs_control_data_bits(this->final_gs_vertex_count); + } + + const fs_builder abld = fs_builder(this).at_end().annotate("thread end"); + fs_inst *inst; + + if (gs_prog_data->static_vertex_count != -1) { + /* Try and tag the last URB write with EOT instead of emitting a whole + * separate write just to finish the thread. + */ + if (mark_last_urb_write_with_eot()) + return; + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles; + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + } else { + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles; + srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count; + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + } + inst->eot = true; + inst->offset = 0; +} + +void +fs_visitor::assign_curb_setup() +{ + unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8); + + unsigned ubo_push_length = 0; + unsigned ubo_push_start[4]; + for (int i = 0; i < 4; i++) { + ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length); + ubo_push_length += stage_prog_data->ubo_ranges[i].length; + } + + prog_data->curb_read_length = uniform_push_length + ubo_push_length; + + uint64_t used = 0; + bool is_compute = gl_shader_stage_is_compute(stage); + + if (is_compute && 
brw_cs_prog_data(prog_data)->uses_inline_data) { + /* With COMPUTE_WALKER, we can push up to one register worth of data via + * the inline data parameter in the COMPUTE_WALKER command itself. + * + * TODO: Support inline data and push at the same time. + */ + assert(devinfo->verx10 >= 125); + assert(uniform_push_length <= reg_unit(devinfo)); + } else if (is_compute && devinfo->verx10 >= 125) { + assert(devinfo->has_lsc); + fs_builder ubld = fs_builder(this, 1).exec_all().at( + cfg->first_block(), cfg->first_block()->start()); + + /* The base offset for our push data is passed in as R0.0[31:6]. We have + * to mask off the bottom 6 bits. + */ + fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.AND(base_addr, + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 6))); + + /* On Gfx12-HP we load constants at the start of the program using A32 + * stateless messages. + */ + for (unsigned i = 0; i < uniform_push_length;) { + /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). 
*/ + unsigned num_regs = MIN2(uniform_push_length - i, 8); + assert(num_regs > 0); + num_regs = 1 << util_logbase2(num_regs); + + fs_reg addr = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.ADD(addr, base_addr, brw_imm_ud(i * REG_SIZE)); + + fs_reg srcs[4] = { + brw_imm_ud(0), /* desc */ + brw_imm_ud(0), /* ex_desc */ + addr, /* payload */ + fs_reg(), /* payload2 */ + }; + + fs_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0), + BRW_REGISTER_TYPE_UD); + fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4); + + send->sfid = GFX12_SFID_UGM; + send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, + 1 /* exec_size */, + LSC_ADDR_SURFTYPE_FLAT, + LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, + num_regs * 8 /* num_channels */, + true /* transpose */, + LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS), + true /* has_dest */); + send->header_size = 0; + send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc); + send->size_written = + lsc_msg_desc_dest_len(devinfo, send->desc) * REG_SIZE; + send->send_is_volatile = true; + + i += num_regs; + } + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + + /* Map the offsets in the UNIFORM file to fixed HW regs. */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + for (unsigned int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == UNIFORM) { + int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4; + int constant_nr; + if (inst->src[i].nr >= UBO_START) { + /* constant_nr is in 32-bit units, the rest are in bytes */ + constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] + + inst->src[i].offset / 4; + } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { + constant_nr = push_constant_loc[uniform_nr]; + } else { + /* Section 5.11 of the OpenGL 4.1 spec says: + * "Out-of-bounds reads return undefined values, which include + * values from other variables of the active program or zero." + * Just return the first push constant. 
+ */ + constant_nr = 0; + } + + assert(constant_nr / 8 < 64); + used |= BITFIELD64_BIT(constant_nr / 8); + + struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs + + constant_nr / 8, + constant_nr % 8); + brw_reg.abs = inst->src[i].abs; + brw_reg.negate = inst->src[i].negate; + + assert(inst->src[i].stride == 0); + inst->src[i] = byte_offset( + retype(brw_reg, inst->src[i].type), + inst->src[i].offset % 4); + } + } + } + + uint64_t want_zero = used & stage_prog_data->zero_push_reg; + if (want_zero) { + fs_builder ubld = fs_builder(this, 8).exec_all().at( + cfg->first_block(), cfg->first_block()->start()); + + /* push_reg_mask_param is in 32-bit units */ + unsigned mask_param = stage_prog_data->push_reg_mask_param; + struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8, + mask_param % 8); + + fs_reg b32; + for (unsigned i = 0; i < 64; i++) { + if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) { + fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2); + ubld.SHL(horiz_offset(shifted, 8), + byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8), + brw_imm_v(0x01234567)); + ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8)); + + fs_builder ubld16 = ubld.group(16, 0); + b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D); + ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15)); + } + + if (want_zero & BITFIELD64_BIT(i)) { + assert(i < prog_data->curb_read_length); + struct brw_reg push_reg = + retype(brw_vec8_grf(payload().num_regs + i, 0), + BRW_REGISTER_TYPE_D); + + ubld.AND(push_reg, push_reg, component(b32, i % 16)); + } + } + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + + /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */ + this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length; +} + +/* + * Build up an array of indices into the urb_setup array that + * references the active entries of the urb_setup array. 
 * Used to accelerate walking the active entries of the urb_setup array
 * on each upload.
 */
void
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
{
   /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
    * skip per-primitive attributes here.
    */

   /* Make sure uint8_t is sufficient */
   STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
   uint8_t index = 0;
   /* Collect, in slot order, every varying slot with a valid urb_setup
    * entry (>= 0 means the FS reads it).
    */
   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
      if (wm_prog_data->urb_setup[attr] >= 0) {
         wm_prog_data->urb_setup_attribs[index++] = attr;
      }
   }
   wm_prog_data->urb_setup_attribs_count = index;
}

/**
 * Figure out where each incoming setup attribute lands in the URB for the
 * fragment shader, filling prog_data->urb_setup / urb_setup_channel.
 *
 * Three layout strategies are used depending on the pipeline: mesh input
 * (possibly with an MUE map), Gfx6+ SBE-based setup, and the legacy
 * pre-Gfx6 SF path (continued below).
 */
static void
calculate_urb_setup(const struct intel_device_info *devinfo,
                    const struct brw_wm_prog_key *key,
                    struct brw_wm_prog_data *prog_data,
                    const nir_shader *nir,
                    const struct brw_mue_map *mue_map)
{
   /* -1 marks a slot as unused; channels default to 0. */
   memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
   memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));

   int urb_next = 0; /* in vec4s */

   const uint64_t inputs_read =
      nir->info.inputs_read & ~nir->info.per_primitive_inputs;

   /* Figure out where each of the incoming setup attributes lands. */
   if (key->mesh_input != BRW_NEVER) {
      /* Per-Primitive Attributes are laid out by Hardware before the regular
       * attributes, so order them like this to make easy later to map setup
       * into real HW registers.
       */
      if (nir->info.per_primitive_inputs) {
         uint64_t per_prim_inputs_read =
            nir->info.inputs_read & nir->info.per_primitive_inputs;

         /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
          * are always at the beginning, because they come from MUE
          * Primitive Header, not Per-Primitive Attributes.
+ */ + const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT | + VARYING_BIT_LAYER | + VARYING_BIT_PRIMITIVE_SHADING_RATE; + + if (mue_map) { + unsigned per_prim_start_dw = mue_map->per_primitive_start_dw; + unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw; + + bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0; + + if (reads_header || mue_map->user_data_in_primitive_header) { + /* Primitive Shading Rate, Layer and Viewport live in the same + * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport + * is dword 2). + */ + if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE) + prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0; + + if (per_prim_inputs_read & VARYING_BIT_LAYER) + prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; + + if (per_prim_inputs_read & VARYING_BIT_VIEWPORT) + prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0; + + per_prim_inputs_read &= ~primitive_header_bits; + } else { + /* If fs doesn't need primitive header, then it won't be made + * available through SBE_MESH, so we have to skip them when + * calculating offset from start of per-prim data. + */ + per_prim_start_dw += mue_map->per_primitive_header_size_dw; + per_prim_size_dw -= mue_map->per_primitive_header_size_dw; + } + + u_foreach_bit64(i, per_prim_inputs_read) { + int start = mue_map->start_dw[i]; + + assert(start >= 0); + assert(mue_map->len_dw[i] > 0); + + assert(unsigned(start) >= per_prim_start_dw); + unsigned pos_dw = unsigned(start) - per_prim_start_dw; + + prog_data->urb_setup[i] = urb_next + pos_dw / 4; + prog_data->urb_setup_channel[i] = pos_dw % 4; + } + + urb_next = per_prim_size_dw / 4; + } else { + /* With no MUE map, we never read the primitive header, and + * per-primitive attributes won't be packed either, so just lay + * them in varying order. 
+ */ + per_prim_inputs_read &= ~primitive_header_bits; + + for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_prim_inputs_read & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + + /* The actual setup attributes later must be aligned to a full GRF. */ + urb_next = ALIGN(urb_next, 2); + } + + prog_data->num_per_primitive_inputs = urb_next; + } + + const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 | + VARYING_BIT_CLIP_DIST1; + + uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; + + if (inputs_read & clip_dist_bits) { + assert(!mue_map || mue_map->per_vertex_header_size_dw > 8); + unique_fs_attrs &= ~clip_dist_bits; + } + + if (mue_map) { + unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw; + unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw; + + /* Per-Vertex header is available to fragment shader only if there's + * user data there. + */ + if (!mue_map->user_data_in_vertex_header) { + per_vertex_start_dw += 8; + per_vertex_size_dw -= 8; + } + + /* In Mesh, CLIP_DIST slots are always at the beginning, because + * they come from MUE Vertex Header, not Per-Vertex Attributes. + */ + if (inputs_read & clip_dist_bits) { + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next; + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1; + } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) { + /* Clip distances are in MUE, but we are not reading them in FS. */ + per_vertex_start_dw += 8; + per_vertex_size_dw -= 8; + } + + /* Per-Vertex attributes are laid out ordered. Because we always link + * Mesh and Fragment shaders, the which slots are written and read by + * each of them will match. 
*/ + u_foreach_bit64(i, unique_fs_attrs) { + int start = mue_map->start_dw[i]; + + assert(start >= 0); + assert(mue_map->len_dw[i] > 0); + + assert(unsigned(start) >= per_vertex_start_dw); + unsigned pos_dw = unsigned(start) - per_vertex_start_dw; + + prog_data->urb_setup[i] = urb_next + pos_dw / 4; + prog_data->urb_setup_channel[i] = pos_dw % 4; + } + + urb_next += per_vertex_size_dw / 4; + } else { + /* If we don't have an MUE map, just lay down the inputs the FS reads + * in varying order, as we do for the legacy pipeline. + */ + if (inputs_read & clip_dist_bits) { + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++; + prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++; + } + + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (unique_fs_attrs & BITFIELD64_BIT(i)) + prog_data->urb_setup[i] = urb_next++; + } + } + } else if (devinfo->ver >= 6) { + assert(!nir->info.per_primitive_inputs); + + uint64_t vue_header_bits = + VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT; + + uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK; + + /* VUE header fields all live in the same URB slot, so we pass them + * as a single FS input attribute. We want to only count them once. + */ + if (inputs_read & vue_header_bits) { + unique_fs_attrs &= ~vue_header_bits; + unique_fs_attrs |= VARYING_BIT_PSIZ; + } + + if (util_bitcount64(unique_fs_attrs) <= 16) { + /* The SF/SBE pipeline stage can do arbitrary rearrangement of the + * first 16 varying inputs, so we can put them wherever we want. + * Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with + * a different vertex (or geometry) shader. + * + * VUE header fields share the same FS input attribute. 
+ */ + if (inputs_read & vue_header_bits) { + if (inputs_read & VARYING_BIT_PSIZ) + prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next; + if (inputs_read & VARYING_BIT_LAYER) + prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next; + if (inputs_read & VARYING_BIT_VIEWPORT) + prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next; + + urb_next++; + } + + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits & + BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them + * in an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). + */ + + /* Re-compute the VUE map here in the case that the one coming from + * geometry has more than one position slot (used for Primitive + * Replication). + */ + struct intel_vue_map prev_stage_vue_map; + brw_compute_vue_map(devinfo, &prev_stage_vue_map, + key->input_slots_valid, + nir->info.separate_shader, 1); + + int first_slot = + brw_compute_first_urb_slot_required(inputs_read, + &prev_stage_vue_map); + + assert(prev_stage_vue_map.num_slots <= first_slot + 32); + for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; + slot++) { + int varying = prev_stage_vue_map.slot_to_varying[slot]; + if (varying != BRW_VARYING_SLOT_PAD && + (inputs_read & BRW_FS_VARYING_INPUT_MASK & + BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = prev_stage_vue_map.num_slots - first_slot; + } + } else { + /* FINISHME: The sf doesn't map VS->FS inputs for us very well. 
*/ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + /* Point size is packed into the header, not as a general attribute */ + if (i == VARYING_SLOT_PSIZ) + continue; + + if (key->input_slots_valid & BITFIELD64_BIT(i)) { + /* The back color slot is skipped when the front color is + * also written to. In addition, some slots can be + * written in the vertex shader and not read in the + * fragment shader. So the register number must always be + * incremented, mapped or not. + */ + if (_mesa_varying_slot_in_fs((gl_varying_slot) i)) + prog_data->urb_setup[i] = urb_next; + urb_next++; + } + } + + /* + * It's a FS only attribute, and we did interpolation for this attribute + * in SF thread. So, count it here, too. + * + * See compile_sf_prog() for more info. + */ + if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC)) + prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++; + } + + prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs; + prog_data->inputs = inputs_read; + + brw_compute_urb_setup_index(prog_data); +} + +void +fs_visitor::assign_urb_setup() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + int urb_start = payload().num_regs + prog_data->base.curb_read_length; + + /* Offset all the urb_setup[] index by the actual position of the + * setup regs, now that the location of the constants has been chosen. + */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == ATTR) { + /* ATTR fs_reg::nr in the FS is in units of logical scalar + * inputs each of which consumes 16B on Gfx4-Gfx12. 
In + * single polygon mode this leads to the following layout + * of the vertex setup plane parameters in the ATTR + * register file: + * + * fs_reg::nr Input Comp0 Comp1 Comp2 Comp3 + * 0 Attr0.x a1-a0 a2-a0 N/A a0 + * 1 Attr0.y a1-a0 a2-a0 N/A a0 + * 2 Attr0.z a1-a0 a2-a0 N/A a0 + * 3 Attr0.w a1-a0 a2-a0 N/A a0 + * 4 Attr1.x a1-a0 a2-a0 N/A a0 + * ... + * + * In multipolygon mode that no longer works since + * different channels may be processing polygons with + * different plane parameters, so each parameter above is + * represented as a dispatch_width-wide vector: + * + * fs_reg::nr fs_reg::offset Input Comp0 ... CompN + * 0 0 Attr0.x a1[0]-a0[0] ... a1[N]-a0[N] + * 0 4 * dispatch_width Attr0.x a2[0]-a0[0] ... a2[N]-a0[N] + * 0 8 * dispatch_width Attr0.x N/A ... N/A + * 0 12 * dispatch_width Attr0.x a0[0] ... a0[N] + * 1 0 Attr0.y a1[0]-a0[0] ... a1[N]-a0[N] + * ... + * + * Note that many of the components on a single row above + * are likely to be replicated multiple times (if, say, a + * single SIMD thread is only processing 2 different + * polygons), so plane parameters aren't actually stored + * in GRF memory with that layout to avoid wasting space. + * Instead we compose ATTR register regions with a 2D + * region that walks through the parameters of each + * polygon with the correct stride, reading the parameter + * corresponding to each channel directly from the PS + * thread payload. + * + * The latter layout corresponds to a param_width equal to + * dispatch_width, while the former (scalar parameter) + * layout has a param_width of 1. + * + * Gfx20+ represent plane parameters in a format similar + * to the above, except the parameters are packed in 12B + * and ordered like "a0, a1-a0, a2-a0" instead of the + * above vec4 representation with a missing component. + */ + const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1); + + /* Size of a single scalar component of a plane parameter + * in bytes. 
+ */ + const unsigned chan_sz = 4; + struct brw_reg reg; + assert(max_polygons > 0); + + /* Calculate the base register on the thread payload of + * either the block of vertex setup data or the block of + * per-primitive constant data depending on whether we're + * accessing a primitive or vertex input. Also calculate + * the index of the input within that block. + */ + const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs; + const unsigned base = urb_start + + (per_prim ? 0 : + ALIGN(prog_data->num_per_primitive_inputs / 2, + reg_unit(devinfo)) * max_polygons); + const unsigned idx = per_prim ? inst->src[i].nr : + inst->src[i].nr - prog_data->num_per_primitive_inputs; + + /* Translate the offset within the param_width-wide + * representation described above into an offset and a + * grf, which contains the plane parameters for the first + * polygon processed by the thread. + */ + if (devinfo->ver >= 20 && !per_prim) { + /* Gfx20+ is able to pack 5 logical input components + * per 64B register for vertex setup data. + */ + const unsigned grf = base + idx / 5 * 2 * max_polygons; + assert(inst->src[i].offset / param_width < 12); + const unsigned delta = idx % 5 * 12 + + inst->src[i].offset / (param_width * chan_sz) * chan_sz + + inst->src[i].offset % chan_sz; + reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), + delta); + } else { + /* Earlier platforms and per-primitive block pack 2 logical + * input components per 32B register. 
+ */ + const unsigned grf = base + idx / 2 * max_polygons; + assert(inst->src[i].offset / param_width < REG_SIZE / 2); + const unsigned delta = (idx % 2) * (REG_SIZE / 2) + + inst->src[i].offset / (param_width * chan_sz) * chan_sz + + inst->src[i].offset % chan_sz; + reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), + delta); + } + + if (max_polygons > 1) { + assert(devinfo->ver >= 12); + /* Misaligned channel strides that would lead to + * cross-channel access in the representation above are + * disallowed. + */ + assert(inst->src[i].stride * type_sz(inst->src[i].type) == chan_sz); + + /* Number of channels processing the same polygon. */ + const unsigned poly_width = dispatch_width / max_polygons; + assert(dispatch_width % max_polygons == 0); + + /* Accessing a subset of channels of a parameter vector + * starting from "chan" is necessary to handle + * SIMD-lowered instructions though. + */ + const unsigned chan = inst->src[i].offset % + (param_width * chan_sz) / chan_sz; + assert(chan < dispatch_width); + assert(chan % poly_width == 0); + const unsigned reg_size = reg_unit(devinfo) * REG_SIZE; + reg = byte_offset(reg, chan / poly_width * reg_size); + + if (inst->exec_size > poly_width) { + /* Accessing the parameters for multiple polygons. + * Corresponding parameters for different polygons + * are stored a GRF apart on the thread payload, so + * use that as vertical stride. + */ + const unsigned vstride = reg_size / type_sz(inst->src[i].type); + assert(vstride <= 32); + assert(chan % poly_width == 0); + reg = stride(reg, vstride, poly_width, 0); + } else { + /* Accessing one parameter for a single polygon -- + * Translate to a scalar region. + */ + assert(chan % poly_width + inst->exec_size <= poly_width); + reg = stride(reg, 0, 1, 0); + } + + } else { + const unsigned width = inst->src[i].stride == 0 ? 
+ 1 : MIN2(inst->exec_size, 8); + reg = stride(reg, width * inst->src[i].stride, + width, inst->src[i].stride); + } + + reg.abs = inst->src[i].abs; + reg.negate = inst->src[i].negate; + inst->src[i] = reg; + } + } + } + + /* Each attribute is 4 setup channels, each of which is half a reg, + * but they may be replicated multiple times for multipolygon + * dispatch. + */ + this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons; + + /* Unlike regular attributes, per-primitive attributes have all 4 channels + * in the same slot, so each GRF can store two slots. + */ + assert(prog_data->num_per_primitive_inputs % 2 == 0); + this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons; +} + +void +fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst) +{ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == ATTR) { + assert(inst->src[i].nr == 0); + int grf = payload().num_regs + + prog_data->curb_read_length + + inst->src[i].offset / REG_SIZE; + + /* As explained at brw_reg_from_fs_reg, From the Haswell PRM: + * + * VertStride must be used to cross GRF register boundaries. This + * rule implies that elements within a 'Width' cannot cross GRF + * boundaries. + * + * So, for registers that are large enough, we have to split the exec + * size in two and trust the compression state to sort it out. + */ + unsigned total_size = inst->exec_size * + inst->src[i].stride * + type_sz(inst->src[i].type); + + assert(total_size <= 2 * REG_SIZE); + const unsigned exec_size = + (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2; + + unsigned width = inst->src[i].stride == 0 ? 
         1 : exec_size;
      struct brw_reg reg =
         stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                            inst->src[i].offset % REG_SIZE),
                exec_size * inst->src[i].stride,
                width, inst->src[i].stride);
      reg.abs = inst->src[i].abs;
      reg.negate = inst->src[i].negate;

      inst->src[i] = reg;
    }
  }
}

/**
 * Map VS ATTR file registers to the fixed payload GRFs and reserve the
 * attribute space after the payload (4 GRFs per attribute slot).
 */
void
fs_visitor::assign_vs_urb_setup()
{
   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);

   assert(stage == MESA_SHADER_VERTEX);

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

/**
 * Map TCS ATTR file registers to hardware GRFs; no extra attribute space is
 * reserved here beyond the payload.
 */
void
fs_visitor::assign_tcs_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_CTRL);

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

/**
 * Map TES ATTR file registers to hardware GRFs and account for the pushed
 * URB data (8 GRFs per urb_read_length unit).
 */
void
fs_visitor::assign_tes_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_EVAL);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

/**
 * Map GS ATTR file registers to hardware GRFs; pushed vertex data is
 * replicated per incoming vertex, hence the vertices_in factor.
 */
void
fs_visitor::assign_gs_urb_setup()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf +=
      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* Rewrite all ATTR file references to GRFs. */
      convert_attr_sources_to_hw_regs(inst);
   }
}


/**
 * Split large virtual GRFs into separate components if we can.
+ * + * This pass aggressively splits VGRFs into as small a chunks as possible, + * down to single registers if it can. If no VGRFs can be split, we return + * false so this pass can safely be used inside an optimization loop. We + * want to split, because virtual GRFs are what we register allocate and + * spill (due to contiguousness requirements for some instructions), and + * they're what we naturally generate in the codegen process, but most + * virtual GRFs don't actually need to be contiguous sets of GRFs. If we + * split, we'll end up with reduced live intervals and better dead code + * elimination and coalescing. + */ +bool +fs_visitor::split_virtual_grfs() +{ + /* Compact the register file so we eliminate dead vgrfs. This + * only defines split points for live registers, so if we have + * too large dead registers they will hit assertions later. + */ + compact_virtual_grfs(); + + unsigned num_vars = this->alloc.count; + + /* Count the total number of registers */ + unsigned reg_count = 0; + unsigned vgrf_to_reg[num_vars]; + for (unsigned i = 0; i < num_vars; i++) { + vgrf_to_reg[i] = reg_count; + reg_count += alloc.sizes[i]; + } + + /* An array of "split points". For each register slot, this indicates + * if this slot can be separated from the previous slot. Every time an + * instruction uses multiple elements of a register (as a source or + * destination), we mark the used slots as inseparable. Then we go + * through and split the registers into the smallest pieces we can. 
+ */ + bool *split_points = new bool[reg_count]; + memset(split_points, 0, reg_count * sizeof(*split_points)); + + /* Mark all used registers as fully splittable */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->dst.file == VGRF) { + unsigned reg = vgrf_to_reg[inst->dst.nr]; + for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++) + split_points[reg + j] = true; + } + + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + unsigned reg = vgrf_to_reg[inst->src[i].nr]; + for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++) + split_points[reg + j] = true; + } + } + } + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + /* We fix up undef instructions later */ + if (inst->opcode == SHADER_OPCODE_UNDEF) { + assert(inst->dst.file == VGRF); + continue; + } + + if (inst->dst.file == VGRF) { + unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; + for (unsigned j = 1; j < regs_written(inst); j++) + split_points[reg + j] = false; + } + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + for (unsigned j = 1; j < regs_read(inst, i); j++) + split_points[reg + j] = false; + } + } + } + + /* Bitset of which registers have been split */ + bool *vgrf_has_split = new bool[num_vars]; + memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split)); + + unsigned *new_virtual_grf = new unsigned[reg_count]; + unsigned *new_reg_offset = new unsigned[reg_count]; + + unsigned reg = 0; + bool has_splits = false; + for (unsigned i = 0; i < num_vars; i++) { + /* The first one should always be 0 as a quick sanity check. 
*/ + assert(split_points[reg] == false); + + /* j = 0 case */ + new_reg_offset[reg] = 0; + reg++; + unsigned offset = 1; + + /* j > 0 case */ + for (unsigned j = 1; j < alloc.sizes[i]; j++) { + /* If this is a split point, reset the offset to 0 and allocate a + * new virtual GRF for the previous offset many registers + */ + if (split_points[reg]) { + has_splits = true; + vgrf_has_split[i] = true; + assert(offset <= MAX_VGRF_SIZE(devinfo)); + unsigned grf = alloc.allocate(offset); + for (unsigned k = reg - offset; k < reg; k++) + new_virtual_grf[k] = grf; + offset = 0; + } + new_reg_offset[reg] = offset; + offset++; + reg++; + } + + /* The last one gets the original register number */ + assert(offset <= MAX_VGRF_SIZE(devinfo)); + alloc.sizes[i] = offset; + for (unsigned k = reg - offset; k < reg; k++) + new_virtual_grf[k] = i; + } + assert(reg == reg_count); + + bool progress; + if (!has_splits) { + progress = false; + goto cleanup; + } + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->opcode == SHADER_OPCODE_UNDEF) { + assert(inst->dst.file == VGRF); + if (vgrf_has_split[inst->dst.nr]) { + const fs_builder ibld(this, block, inst); + assert(inst->size_written % REG_SIZE == 0); + unsigned reg_offset = inst->dst.offset / REG_SIZE; + unsigned size_written = 0; + while (size_written < inst->size_written) { + reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE; + fs_inst *undef = + ibld.UNDEF( + byte_offset(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type), + new_reg_offset[reg] * REG_SIZE)); + undef->size_written = + MIN2(inst->size_written - size_written, undef->size_written); + assert(undef->size_written % REG_SIZE == 0); + size_written += undef->size_written; + } + inst->remove(block); + } else { + reg = vgrf_to_reg[inst->dst.nr]; + assert(new_reg_offset[reg] == 0); + assert(new_virtual_grf[reg] == inst->dst.nr); + } + continue; + } + + if (inst->dst.file == VGRF) { + reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / 
REG_SIZE; + if (vgrf_has_split[inst->dst.nr]) { + inst->dst.nr = new_virtual_grf[reg]; + inst->dst.offset = new_reg_offset[reg] * REG_SIZE + + inst->dst.offset % REG_SIZE; + assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); + } else { + assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE); + assert(new_virtual_grf[reg] == inst->dst.nr); + } + } + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file != VGRF) + continue; + + reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + if (vgrf_has_split[inst->src[i].nr]) { + inst->src[i].nr = new_virtual_grf[reg]; + inst->src[i].offset = new_reg_offset[reg] * REG_SIZE + + inst->src[i].offset % REG_SIZE; + assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); + } else { + assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE); + assert(new_virtual_grf[reg] == inst->src[i].nr); + } + } + } + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); + + progress = true; + +cleanup: + delete[] split_points; + delete[] vgrf_has_split; + delete[] new_virtual_grf; + delete[] new_reg_offset; + + return progress; +} + +/** + * Remove unused virtual GRFs and compact the vgrf_* arrays. + * + * During code generation, we create tons of temporary variables, many of + * which get immediately killed and are never used again. Yet, in later + * optimization and analysis passes, such as compute_live_intervals, we need + * to loop over all the virtual GRFs. Compacting them can save a lot of + * overhead. + */ +bool +fs_visitor::compact_virtual_grfs() +{ + bool progress = false; + int *remap_table = new int[this->alloc.count]; + memset(remap_table, -1, this->alloc.count * sizeof(int)); + + /* Mark which virtual GRFs are used. 
    */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays: assign each live VGRF the next free index and
    * record the mapping in remap_table (-1 stays "dead").
    */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register.  This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation.  If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
      if (delta_xy[i].file == VGRF) {
         if (remap_table[delta_xy[i].nr] != -1) {
            delta_xy[i].nr = remap_table[delta_xy[i].nr];
         } else {
            delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}

/**
 * Return the index of the subgroup-ID push constant, or -1 if it is not
 * pushed (no params, or verx10 >= 125 where it is not used as a param).
 * By convention it is always the last parameter in the list.
 */
int
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
                                const brw_stage_prog_data *prog_data)
{
   if (prog_data->nr_params == 0)
      return -1;

   if (devinfo->verx10 >= 125)
      return -1;

   /* The local thread id is always the last parameter in the list */
   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
      return prog_data->nr_params - 1;

   return -1;
}

/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile gets to decide on locations. */
   if (push_constant_loc)
      return;

   /* Identity mapping: every uniform keeps its own push slot. */
   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   for (unsigned u = 0; u < uniforms; u++)
      push_constant_loc[u] = u;

   /* Now that we know how many regular uniforms we'll push, reduce the
    * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
    */
   /* For gen4/5:
    * Only allow 16 registers (128 uniform components) as push constants.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c/crocus_state.c
    */
   const unsigned max_push_length = compiler->devinfo->ver < 6 ?
16 : 64; + unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8); + for (int i = 0; i < 4; i++) { + struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + + if (push_length + range->length > max_push_length) + range->length = max_push_length - push_length; + + push_length += range->length; + } + assert(push_length <= max_push_length); +} + +bool +fs_visitor::get_pull_locs(const fs_reg &src, + unsigned *out_surf_index, + unsigned *out_pull_index) +{ + assert(src.file == UNIFORM); + + if (src.nr < UBO_START) + return false; + + const struct brw_ubo_range *range = + &prog_data->ubo_ranges[src.nr - UBO_START]; + + /* If this access is in our (reduced) range, use the push data. */ + if (src.offset / 32 < range->length) + return false; + + *out_surf_index = range->block; + *out_pull_index = (32 * range->start + src.offset) / 4; + + prog_data->has_ubo_pull = true; + + return true; +} + +/** + * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD + * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. + */ +bool +fs_visitor::lower_constant_loads() +{ + unsigned index, pull_index; + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + /* Set up the annotation tracking for new generated instructions. */ + const fs_builder ibld(this, block, inst); + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != UNIFORM) + continue; + + /* We'll handle this case later */ + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) + continue; + + if (!get_pull_locs(inst->src[i], &index, &pull_index)) + continue; + + assert(inst->src[i].stride == 0); + + const unsigned block_sz = 64; /* Fetch one cacheline at a time. 
*/ + const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0); + const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); + const unsigned base = pull_index * 4; + + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, + srcs, PULL_UNIFORM_CONSTANT_SRCS); + + /* Rewrite the instruction to use the temporary VGRF. */ + inst->src[i].file = VGRF; + inst->src[i].nr = dst.nr; + inst->src[i].offset = (base & (block_sz - 1)) + + inst->src[i].offset % 4; + + progress = true; + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + + if (!get_pull_locs(inst->src[0], &index, &pull_index)) + continue; + + VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, + brw_imm_ud(index), + fs_reg() /* surface_handle */, + inst->src[1], + pull_index * 4, 4, 1); + inst->remove(block); + + progress = true; + } + } + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +static uint64_t +src_as_uint(const fs_reg &src) +{ + assert(src.file == IMM); + + switch (src.type) { + case BRW_REGISTER_TYPE_W: + return (uint64_t)(int16_t)(src.ud & 0xffff); + + case BRW_REGISTER_TYPE_UW: + return (uint64_t)(uint16_t)(src.ud & 0xffff); + + case BRW_REGISTER_TYPE_D: + return (uint64_t)src.d; + + case BRW_REGISTER_TYPE_UD: + return (uint64_t)src.ud; + + case BRW_REGISTER_TYPE_Q: + return src.d64; + + case BRW_REGISTER_TYPE_UQ: + return src.u64; + + default: + unreachable("Invalid integer type."); + } +} + +static fs_reg +brw_imm_for_type(uint64_t value, enum brw_reg_type type) +{ + switch (type) { + case BRW_REGISTER_TYPE_W: + return brw_imm_w(value); + + case BRW_REGISTER_TYPE_UW: + return brw_imm_uw(value); + + case BRW_REGISTER_TYPE_D: + return brw_imm_d(value); + + case BRW_REGISTER_TYPE_UD: + return 
brw_imm_ud(value); + + case BRW_REGISTER_TYPE_Q: + return brw_imm_d(value); + + case BRW_REGISTER_TYPE_UQ: + return brw_imm_uq(value); + + default: + unreachable("Invalid integer type."); + } +} + +bool +fs_visitor::opt_algebraic() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + switch (inst->opcode) { + case BRW_OPCODE_MOV: + if (!devinfo->has_64bit_float && + inst->dst.type == BRW_REGISTER_TYPE_DF) { + assert(inst->dst.type == inst->src[0].type); + assert(!inst->saturate); + assert(!inst->src[0].abs); + assert(!inst->src[0].negate); + const brw::fs_builder ibld(this, block, inst); + + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 1), + subscript(inst->src[0], BRW_REGISTER_TYPE_F, 1)); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 0), + subscript(inst->src[0], BRW_REGISTER_TYPE_F, 0)); + + inst->remove(block); + progress = true; + } + + if (!devinfo->has_64bit_int && + (inst->dst.type == BRW_REGISTER_TYPE_UQ || + inst->dst.type == BRW_REGISTER_TYPE_Q)) { + assert(inst->dst.type == inst->src[0].type); + assert(!inst->saturate); + assert(!inst->src[0].abs); + assert(!inst->src[0].negate); + const brw::fs_builder ibld(this, block, inst); + + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1)); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0)); + + inst->remove(block); + progress = true; + } + + if ((inst->conditional_mod == BRW_CONDITIONAL_Z || + inst->conditional_mod == BRW_CONDITIONAL_NZ) && + inst->dst.is_null() && + (inst->src[0].abs || inst->src[0].negate)) { + inst->src[0].abs = false; + inst->src[0].negate = false; + progress = true; + break; + } + + if (inst->src[0].file != IMM) + break; + + if (inst->saturate) { + /* Full mixed-type saturates don't happen. 
However, we can end up + * with things like: + * + * mov.sat(8) g21<1>DF -1F + * + * Other mixed-size-but-same-base-type cases may also be possible. + */ + if (inst->dst.type != inst->src[0].type && + inst->dst.type != BRW_REGISTER_TYPE_DF && + inst->src[0].type != BRW_REGISTER_TYPE_F) + assert(!"unimplemented: saturate mixed types"); + + if (brw_saturate_immediate(inst->src[0].type, + &inst->src[0].as_brw_reg())) { + inst->saturate = false; + progress = true; + } + } + break; + + case BRW_OPCODE_MUL: + if (inst->src[1].file != IMM) + continue; + + if (brw_reg_type_is_floating_point(inst->src[1].type)) + break; + + /* From the BDW PRM, Vol 2a, "mul - Multiply": + * + * "When multiplying integer datatypes, if src0 is DW and src1 + * is W, irrespective of the destination datatype, the + * accumulator maintains full 48-bit precision." + * ... + * "When multiplying integer data types, if one of the sources + * is a DW, the resulting full precision data is stored in + * the accumulator." + * + * There are also similar notes in earlier PRMs. + * + * The MOV instruction can copy the bits of the source, but it + * does not clear the higher bits of the accumulator. So, because + * we might use the full accumulator in the MUL/MACH macro, we + * shouldn't replace such MULs with MOVs. 
+ */ + if ((brw_reg_type_to_size(inst->src[0].type) == 4 || + brw_reg_type_to_size(inst->src[1].type) == 4) && + (inst->dst.is_accumulator() || + inst->writes_accumulator_implicitly(devinfo))) + break; + + /* a * 1.0 = a */ + if (inst->src[1].is_one()) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + progress = true; + break; + } + + /* a * -1.0 = -a */ + if (inst->src[1].is_negative_one()) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[0].negate = !inst->src[0].negate; + inst->src[1] = reg_undef; + progress = true; + break; + } + + break; + case BRW_OPCODE_ADD: + if (inst->src[1].file != IMM) + continue; + + if (brw_reg_type_is_integer(inst->src[1].type) && + inst->src[1].is_zero()) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + progress = true; + break; + } + + if (inst->src[0].file == IMM) { + assert(inst->src[0].type == BRW_REGISTER_TYPE_F); + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[0].f += inst->src[1].f; + inst->src[1] = reg_undef; + progress = true; + break; + } + break; + + case BRW_OPCODE_AND: + if (inst->src[0].file == IMM && inst->src[1].file == IMM) { + const uint64_t src0 = src_as_uint(inst->src[0]); + const uint64_t src1 = src_as_uint(inst->src[1]); + + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[0] = brw_imm_for_type(src0 & src1, inst->dst.type); + inst->src[1] = reg_undef; + progress = true; + break; + } + + break; + + case BRW_OPCODE_OR: + if (inst->src[0].file == IMM && inst->src[1].file == IMM) { + const uint64_t src0 = src_as_uint(inst->src[0]); + const uint64_t src1 = src_as_uint(inst->src[1]); + + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[0] = brw_imm_for_type(src0 | src1, inst->dst.type); + inst->src[1] = reg_undef; + progress = true; + break; + } + + if (inst->src[0].equals(inst->src[1]) || + inst->src[1].is_zero()) { + /* On Gfx8+, the OR instruction can have a source modifier 
that + * performs logical not on the operand. Cases of 'OR r0, ~r1, 0' + * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV. + */ + if (inst->src[0].negate) { + inst->opcode = BRW_OPCODE_NOT; + inst->sources = 1; + inst->src[0].negate = false; + } else { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + } + inst->src[1] = reg_undef; + progress = true; + break; + } + break; + case BRW_OPCODE_CMP: + if ((inst->conditional_mod == BRW_CONDITIONAL_Z || + inst->conditional_mod == BRW_CONDITIONAL_NZ) && + inst->src[1].is_zero() && + (inst->src[0].abs || inst->src[0].negate)) { + inst->src[0].abs = false; + inst->src[0].negate = false; + progress = true; + break; + } + break; + case BRW_OPCODE_SEL: + if (!devinfo->has_64bit_float && + !devinfo->has_64bit_int && + (inst->dst.type == BRW_REGISTER_TYPE_DF || + inst->dst.type == BRW_REGISTER_TYPE_UQ || + inst->dst.type == BRW_REGISTER_TYPE_Q)) { + assert(inst->dst.type == inst->src[0].type); + assert(!inst->saturate); + assert(!inst->src[0].abs && !inst->src[0].negate); + assert(!inst->src[1].abs && !inst->src[1].negate); + const brw::fs_builder ibld(this, block, inst); + + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + + set_predicate(inst->predicate, + ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0))); + set_predicate(inst->predicate, + ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1))); + + inst->remove(block); + progress = true; + } + if (inst->src[0].equals(inst->src[1])) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + inst->predicate = BRW_PREDICATE_NONE; + inst->predicate_inverse = false; + progress = true; + } else if (inst->saturate && inst->src[1].file == IMM) { + switch (inst->conditional_mod) { + case BRW_CONDITIONAL_LE: + case 
BRW_CONDITIONAL_L: + switch (inst->src[1].type) { + case BRW_REGISTER_TYPE_F: + if (inst->src[1].f >= 1.0f) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + progress = true; + } + break; + default: + break; + } + break; + case BRW_CONDITIONAL_GE: + case BRW_CONDITIONAL_G: + switch (inst->src[1].type) { + case BRW_REGISTER_TYPE_F: + if (inst->src[1].f <= 0.0f) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->src[1] = reg_undef; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + progress = true; + } + break; + default: + break; + } + default: + break; + } + } + break; + case BRW_OPCODE_MAD: + if (inst->src[0].type != BRW_REGISTER_TYPE_F || + inst->src[1].type != BRW_REGISTER_TYPE_F || + inst->src[2].type != BRW_REGISTER_TYPE_F) + break; + if (inst->src[1].is_one()) { + inst->opcode = BRW_OPCODE_ADD; + inst->sources = 2; + inst->src[1] = inst->src[2]; + inst->src[2] = reg_undef; + progress = true; + } else if (inst->src[2].is_one()) { + inst->opcode = BRW_OPCODE_ADD; + inst->sources = 2; + inst->src[2] = reg_undef; + progress = true; + } + break; + case BRW_OPCODE_SHL: + if (inst->src[0].file == IMM && inst->src[1].file == IMM) { + /* It's not currently possible to generate this, and this constant + * folding does not handle it. + */ + assert(!inst->saturate); + + fs_reg result; + + switch (type_sz(inst->src[0].type)) { + case 2: + result = brw_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f))); + break; + case 4: + result = brw_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f)); + break; + case 8: + result = brw_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f)); + break; + default: + /* Just in case a future platform re-enables B or UB types. 
*/ + unreachable("Invalid source size."); + } + + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = retype(result, inst->dst.type); + inst->src[1] = reg_undef; + inst->sources = 1; + + progress = true; + } + break; + + case SHADER_OPCODE_BROADCAST: + if (is_uniform(inst->src[0])) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + inst->force_writemask_all = true; + progress = true; + } else if (inst->src[1].file == IMM) { + inst->opcode = BRW_OPCODE_MOV; + /* It's possible that the selected component will be too large and + * overflow the register. This can happen if someone does a + * readInvocation() from GLSL or SPIR-V and provides an OOB + * invocationIndex. If this happens and we some how manage + * to constant fold it in and get here, then component() may cause + * us to start reading outside of the VGRF which will lead to an + * assert later. Instead, just let it wrap around if it goes over + * exec_size. + */ + const unsigned comp = inst->src[1].ud & (inst->exec_size - 1); + inst->src[0] = component(inst->src[0], comp); + inst->sources = 1; + inst->force_writemask_all = true; + progress = true; + } + break; + + case SHADER_OPCODE_SHUFFLE: + if (is_uniform(inst->src[0])) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + progress = true; + } else if (inst->src[1].file == IMM) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = component(inst->src[0], + inst->src[1].ud); + inst->sources = 1; + progress = true; + } + break; + + default: + break; + } + + /* Ensure that the correct source has the immediate value. 2-source + * instructions must have the immediate in src[1]. On Gfx12 and later, + * some 3-source instructions can have the immediate in src[0] or + * src[2]. It's complicated, so don't mess with 3-source instructions + * here. 
+ */ + if (progress && inst->sources == 2 && inst->is_commutative()) { + if (inst->src[0].file == IMM) { + fs_reg tmp = inst->src[1]; + inst->src[1] = inst->src[0]; + inst->src[0] = tmp; + } + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | + DEPENDENCY_INSTRUCTION_DETAIL); + + return progress; +} + +static unsigned +load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read) +{ + assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD); + assert(size_read >= lp->header_size * REG_SIZE); + + unsigned i; + unsigned size = lp->header_size * REG_SIZE; + for (i = lp->header_size; size < size_read && i < lp->sources; i++) + size += lp->exec_size * type_sz(lp->src[i].type); + + /* Size read must cover exactly a subset of sources. */ + assert(size == size_read); + return i; +} + +/** + * Optimize sample messages that have constant zero values for the trailing + * parameters. We can just reduce the message length for these + * instructions instead of reserving a register for it. Trailing parameters + * that aren't sent default to zero anyway. This will cause the dead code + * eliminator to remove the MOV instruction that would otherwise be emitted to + * set up the zero value. + */ +bool +fs_visitor::opt_zero_samples() +{ + /* Implementation supports only SENDs, so applicable to Gfx7+ only. */ + assert(devinfo->ver >= 7); + + bool progress = false; + + foreach_block_and_inst(block, fs_inst, send, cfg) { + if (send->opcode != SHADER_OPCODE_SEND || + send->sfid != BRW_SFID_SAMPLER) + continue; + + /* Wa_14012688258: + * + * Don't trim zeros at the end of payload for sample operations + * in cube and cube arrays. + */ + if (send->keep_payload_trailing_zeros) + continue; + + /* This pass works on SENDs before splitting. 
*/ + if (send->ex_mlen > 0) + continue; + + fs_inst *lp = (fs_inst *) send->prev; + + if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + /* How much of the payload are actually read by this SEND. */ + const unsigned params = + load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE); + + /* We don't want to remove the message header or the first parameter. + * Removing the first parameter is not allowed, see the Haswell PRM + * volume 7, page 149: + * + * "Parameter 0 is required except for the sampleinfo message, which + * has no parameter 0" + */ + const unsigned first_param_idx = lp->header_size; + unsigned zero_size = 0; + for (unsigned i = params - 1; i > first_param_idx; i--) { + if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero()) + break; + zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride; + } + + const unsigned zero_len = zero_size / (reg_unit(devinfo) * REG_SIZE); + if (zero_len > 0) { + send->mlen -= zero_len; + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); + + return progress; +} + +/** + * Opportunistically split SEND message payloads. + * + * Gfx9+ supports "split" SEND messages, which take two payloads that are + * implicitly concatenated. If we find a SEND message with a single payload, + * we can split that payload in two. This results in smaller contiguous + * register blocks for us to allocate. But it can help beyond that, too. + * + * We try and split a LOAD_PAYLOAD between sources which change registers. + * For example, a sampler message often contains a x/y/z coordinate that may + * already be in a contiguous VGRF, combined with an LOD, shadow comparitor, + * or array index, which comes from elsewhere. In this case, the first few + * sources will be different offsets of the same VGRF, then a later source + * will be a different VGRF. So we split there, possibly eliminating the + * payload concatenation altogether. 
+ */ +bool +fs_visitor::opt_split_sends() +{ + if (devinfo->ver < 9) + return false; + + bool progress = false; + + foreach_block_and_inst(block, fs_inst, send, cfg) { + if (send->opcode != SHADER_OPCODE_SEND || + send->mlen <= reg_unit(devinfo) || send->ex_mlen > 0) + continue; + + assert(send->src[2].file == VGRF); + + /* Currently don't split sends that reuse a previously used payload. */ + fs_inst *lp = (fs_inst *) send->prev; + + if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr) + continue; + + /* Split either after the header (if present), or when consecutive + * sources switch from one VGRF to a different one. + */ + unsigned mid = lp->header_size; + if (mid == 0) { + for (mid = 1; mid < lp->sources; mid++) { + if (lp->src[mid].file == BAD_FILE) + continue; + + if (lp->src[0].file != lp->src[mid].file || + lp->src[0].nr != lp->src[mid].nr) + break; + } + } + + /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so + * find out how many sources from the payload does it really need. + */ + const unsigned end = + load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE); + + /* Nothing to split. 
*/ + if (end <= mid) + continue; + + const fs_builder ibld(this, block, lp); + fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size); + fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0); + + assert(lp1->size_written % REG_SIZE == 0); + assert(lp2->size_written % REG_SIZE == 0); + assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen); + + lp1->dst = fs_reg(VGRF, alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type); + lp2->dst = fs_reg(VGRF, alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type); + + send->resize_sources(4); + send->src[2] = lp1->dst; + send->src[3] = lp2->dst; + send->ex_mlen = lp2->size_written / REG_SIZE; + send->mlen -= send->ex_mlen; + + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Remove redundant or useless halts. + * + * For example, we can eliminate halts in the following sequence: + * + * halt (redundant with the next halt) + * halt (useless; jumps to the next instruction) + * halt-target + */ +bool +fs_visitor::opt_redundant_halt() +{ + bool progress = false; + + unsigned halt_count = 0; + fs_inst *halt_target = NULL; + bblock_t *halt_target_block = NULL; + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->opcode == BRW_OPCODE_HALT) + halt_count++; + + if (inst->opcode == SHADER_OPCODE_HALT_TARGET) { + halt_target = inst; + halt_target_block = block; + break; + } + } + + if (!halt_target) { + assert(halt_count == 0); + return false; + } + + /* Delete any HALTs immediately before the halt target. 
*/ + for (fs_inst *prev = (fs_inst *) halt_target->prev; + !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT; + prev = (fs_inst *) halt_target->prev) { + prev->remove(halt_target_block); + halt_count--; + progress = true; + } + + if (halt_count == 0) { + halt_target->remove(halt_target_block); + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +/** + * Compute a bitmask with GRF granularity with a bit set for each GRF starting + * from \p r.offset which overlaps the region starting at \p s.offset and + * spanning \p ds bytes. + */ +static inline unsigned +mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds) +{ + const int rel_offset = reg_offset(s) - reg_offset(r); + const int shift = rel_offset / REG_SIZE; + const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE); + assert(reg_space(r) == reg_space(s) && + shift >= 0 && shift < int(8 * sizeof(unsigned))); + return ((1 << n) - 1) << shift; +} + +bool +fs_visitor::compute_to_mrf() +{ + bool progress = false; + int next_ip = 0; + + /* No MRFs on Gen >= 7. */ + if (devinfo->ver >= 7) + return false; + + const fs_live_variables &live = live_analysis.require(); + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + int ip = next_ip; + next_ip++; + + if (inst->opcode != BRW_OPCODE_MOV || + inst->is_partial_write() || + inst->dst.file != MRF || inst->src[0].file != VGRF || + inst->dst.type != inst->src[0].type || + inst->src[0].abs || inst->src[0].negate || + !inst->src[0].is_contiguous() || + inst->src[0].offset % REG_SIZE != 0) + continue; + + /* Can't compute-to-MRF this GRF if someone else was going to + * read it later. + */ + if (live.vgrf_end[inst->src[0].nr] > ip) + continue; + + /* Found a move of a GRF to a MRF. Let's see if we can go rewrite the + * things that computed the value of all GRFs of the source region. 
The + * regs_left bitset keeps track of the registers we haven't yet found a + * generating instruction for. + */ + unsigned regs_left = (1 << regs_read(inst, 0)) - 1; + + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + /* Found the last thing to write our reg we want to turn + * into a compute-to-MRF. + */ + + /* If this one instruction didn't populate all the + * channels, bail. We might be able to rewrite everything + * that writes that reg, but it would require smarter + * tracking. + */ + if (scan_inst->is_partial_write()) + break; + + /* Handling things not fully contained in the source of the copy + * would need us to understand coalescing out more than one MOV at + * a time. + */ + if (!region_contained_in(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) + break; + + /* SEND instructions can't have MRF as a destination. */ + if (scan_inst->mlen) + break; + + if (devinfo->ver == 6) { + /* gfx6 math instructions must have the destination be + * GRF, so no compute-to-MRF for them. + */ + if (scan_inst->is_math()) { + break; + } + } + + /* Clear the bits for any registers this instruction overwrites. */ + regs_left &= ~mask_relative_to( + inst->src[0], scan_inst->dst, scan_inst->size_written); + if (!regs_left) + break; + } + + /* We don't handle control flow here. Most computation of + * values that end up in MRFs are shortly before the MRF + * write anyway. + */ + if (block->start() == scan_inst) + break; + + /* You can't read from an MRF, so if someone else reads our + * MRF's source GRF that we wanted to rewrite, that stops us. 
+ */ + bool interfered = false; + for (int i = 0; i < scan_inst->sources; i++) { + if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i), + inst->src[0], inst->size_read(0))) { + interfered = true; + } + } + if (interfered) + break; + + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->dst, inst->size_written)) { + /* If somebody else writes our MRF here, we can't + * compute-to-MRF before that. + */ + break; + } + + if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 && + regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE, + inst->dst, inst->size_written)) { + /* Found a SEND instruction, which means that there are + * live values in MRFs from base_mrf to base_mrf + + * scan_inst->mlen - 1. Don't go pushing our MRF write up + * above it. + */ + break; + } + } + + if (regs_left) + continue; + + /* Found all generating instructions of our MRF's source value, so it + * should be safe to rewrite them to point to the MRF directly. + */ + regs_left = (1 << regs_read(inst, 0)) - 1; + + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + /* Clear the bits for any registers this instruction overwrites. */ + regs_left &= ~mask_relative_to( + inst->src[0], scan_inst->dst, scan_inst->size_written); + + const unsigned rel_offset = reg_offset(scan_inst->dst) - + reg_offset(inst->src[0]); + + if (inst->dst.nr & BRW_MRF_COMPR4) { + /* Apply the same address transformation done by the hardware + * for COMPR4 MRF writes. + */ + assert(rel_offset < 2 * REG_SIZE); + scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4; + + /* Clear the COMPR4 bit if the generating instruction is not + * compressed. + */ + if (scan_inst->size_written < 2 * REG_SIZE) + scan_inst->dst.nr &= ~BRW_MRF_COMPR4; + + } else { + /* Calculate the MRF number the result of this instruction is + * ultimately written to. 
+ */ + scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE; + } + + scan_inst->dst.file = MRF; + scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE; + scan_inst->saturate |= inst->saturate; + if (!regs_left) + break; + } + } + + assert(!regs_left); + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +/** + * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control + * flow. We could probably do better here with some form of divergence + * analysis. + */ +bool +fs_visitor::eliminate_find_live_channel() +{ + bool progress = false; + unsigned depth = 0; + + if (!brw_stage_has_packed_dispatch(devinfo, stage, max_polygons, + stage_prog_data)) { + /* The optimization below assumes that channel zero is live on thread + * dispatch, which may not be the case if the fixed function dispatches + * threads sparsely. + */ + return false; + } + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + switch (inst->opcode) { + case BRW_OPCODE_IF: + case BRW_OPCODE_DO: + depth++; + break; + + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + depth--; + break; + + case BRW_OPCODE_HALT: + /* This can potentially make control flow non-uniform until the end + * of the program. + */ + goto out; + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + if (depth == 0) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = brw_imm_ud(0u); + inst->sources = 1; + inst->force_writemask_all = true; + progress = true; + } + break; + + default: + break; + } + } + +out: + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); + + return progress; +} + +/** + * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE + * instructions to FS_OPCODE_REP_FB_WRITE. 
+ */ +void +fs_visitor::emit_repclear_shader() +{ + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + fs_inst *write = NULL; + + assert(uniforms == 0); + assume(key->nr_color_regions > 0); + + fs_reg color_output, header; + if (devinfo->ver >= 7) { + color_output = retype(brw_vec4_grf(127, 0), BRW_REGISTER_TYPE_UD); + header = retype(brw_vec8_grf(125, 0), BRW_REGISTER_TYPE_UD); + } else { + color_output = retype(brw_vec4_reg(MRF, 2, 0), BRW_REGISTER_TYPE_UD); + header = retype(brw_vec8_reg(MRF, 0, 0), BRW_REGISTER_TYPE_UD); + } + + /* We pass the clear color as a flat input. Copy it to the output. */ + fs_reg color_input = + brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4, + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + + const fs_builder bld = fs_builder(this).at_end(); + bld.exec_all().group(4, 0).MOV(color_output, color_input); + + if (key->nr_color_regions > 1) { + /* Copy g0..g1 as the message header */ + bld.exec_all().group(16, 0) + .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + } + + for (int i = 0; i < key->nr_color_regions; ++i) { + if (i > 0) + bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i)); + + if (devinfo->ver >= 7) { + write = bld.emit(SHADER_OPCODE_SEND); + write->resize_sources(3); + write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE; + write->src[0] = brw_imm_ud(0); + write->src[1] = brw_imm_ud(0); + write->src[2] = i == 0 ? color_output : header; + write->check_tdr = true; + write->send_has_side_effects = true; + write->desc = brw_fb_write_desc(devinfo, i, + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED, + i == key->nr_color_regions - 1, false); + } else { + write = bld.emit(FS_OPCODE_REP_FB_WRITE); + write->target = i; + write->base_mrf = i == 0 ? color_output.nr : header.nr; + } + + /* We can use a headerless message for the first render target */ + write->header_size = i == 0 ? 
0 : 2; + write->mlen = 1 + write->header_size; + } + write->eot = true; + write->last_rt = true; + + calculate_cfg(); + + this->first_non_payload_grf = payload().num_regs; + + lower_scoreboard(); +} + +/** + * Walks through basic blocks, looking for repeated MRF writes and + * removing the later ones. + */ +bool +fs_visitor::remove_duplicate_mrf_writes() +{ + fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->ver)]; + bool progress = false; + + /* Need to update the MRF tracking for compressed instructions. */ + if (dispatch_width >= 16) + return false; + + memset(last_mrf_move, 0, sizeof(last_mrf_move)); + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + if (inst->is_control_flow()) { + memset(last_mrf_move, 0, sizeof(last_mrf_move)); + } + + if (inst->opcode == BRW_OPCODE_MOV && + inst->dst.file == MRF) { + fs_inst *prev_inst = last_mrf_move[inst->dst.nr]; + if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV && + inst->dst.equals(prev_inst->dst) && + inst->src[0].equals(prev_inst->src[0]) && + inst->saturate == prev_inst->saturate && + inst->predicate == prev_inst->predicate && + inst->conditional_mod == prev_inst->conditional_mod && + inst->exec_size == prev_inst->exec_size) { + inst->remove(block); + progress = true; + continue; + } + } + + /* Clear out the last-write records for MRFs that were overwritten. */ + if (inst->dst.file == MRF) { + last_mrf_move[inst->dst.nr] = NULL; + } + + if (inst->mlen > 0 && inst->base_mrf != -1) { + /* Found a SEND instruction, which will include two or fewer + * implied MRF writes. We could do better here. + */ + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { + last_mrf_move[inst->base_mrf + i] = NULL; + } + } + + /* Clear out any MRF move records whose sources got overwritten. 
*/ + for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) { + if (last_mrf_move[i] && + regions_overlap(inst->dst, inst->size_written, + last_mrf_move[i]->src[0], + last_mrf_move[i]->size_read(0))) { + last_mrf_move[i] = NULL; + } + } + + if (inst->opcode == BRW_OPCODE_MOV && + inst->dst.file == MRF && + inst->src[0].file != ARF && + !inst->is_partial_write()) { + last_mrf_move[inst->dst.nr] = inst; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +/** + * Rounding modes for conversion instructions are included for each + * conversion, but right now it is a state. So once it is set, + * we don't need to call it again for subsequent calls. + * + * This is useful for vector/matrices conversions, as setting the + * mode once is enough for the full vector/matrix + */ +bool +fs_visitor::remove_extra_rounding_modes() +{ + bool progress = false; + unsigned execution_mode = this->nir->info.float_controls_execution_mode; + + brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED; + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) & + execution_mode) + base_mode = BRW_RND_MODE_RTNE; + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) & + execution_mode) + base_mode = BRW_RND_MODE_RTZ; + + foreach_block (block, cfg) { + brw_rnd_mode prev_mode = base_mode; + + foreach_inst_in_block_safe (fs_inst, inst, block) { + if (inst->opcode == SHADER_OPCODE_RND_MODE) { + assert(inst->src[0].file == BRW_IMMEDIATE_VALUE); + const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d; + if (mode == prev_mode) { + inst->remove(block); + progress = true; + } else { + prev_mode = mode; + } + } + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +static void +clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len) +{ + /* 
Clear the flag for registers that actually got read (as expected). */ + for (int i = 0; i < inst->sources; i++) { + int grf; + if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) { + grf = inst->src[i].nr; + } else { + continue; + } + + if (grf >= first_grf && + grf < first_grf + grf_len) { + deps[grf - first_grf] = false; + if (inst->exec_size == 16) + deps[grf - first_grf + 1] = false; + } + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not + * check for post destination dependencies on this instruction, software + * must ensure that there is no destination hazard for the case of ‘write + * followed by a posted write’ shown in the following example. + * + * 1. mov r3 0 + * 2. send r3.xy + * 3. mov r2 r3 + * + * Due to no post-destination dependency check on the ‘send’, the above + * code sequence could have two instructions (1 and 2) in flight at the + * same time that both consider ‘r3’ as the target of their final writes. + */ +void +fs_visitor::insert_gfx4_pre_send_dependency_workarounds(bblock_t *block, + fs_inst *inst) +{ + int write_len = regs_written(inst); + int first_write_grf = inst->dst.nr; + bool needs_dep[BRW_MAX_MRF(devinfo->ver)]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + + clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len); + + /* Walk backwards looking for writes to registers we're writing which + * aren't read since being written. If we hit the start of the program, + * we assume that there are no outstanding dependencies on entry to the + * program. + */ + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + /* If we hit control flow, assume that there *are* outstanding + * dependencies, and force their cleanup before our instruction. 
+ */ + if (block->start() == scan_inst && block->num != 0) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + DEP_RESOLVE_MOV(fs_builder(this, block, inst), + first_write_grf + i); + } + return; + } + + /* We insert our reads as late as possible on the assumption that any + * instruction but a MOV that might have left us an outstanding + * dependency has more latency than a MOV. + */ + if (scan_inst->dst.file == VGRF) { + for (unsigned i = 0; i < regs_written(scan_inst); i++) { + int reg = scan_inst->dst.nr + i; + + if (reg >= first_write_grf && + reg < first_write_grf + write_len && + needs_dep[reg - first_write_grf]) { + DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg); + needs_dep[reg - first_write_grf] = false; + if (scan_inst->exec_size == 16) + needs_dep[reg - first_write_grf + 1] = false; + } + } + } + + /* Clear the flag for registers that actually got read (as expected). */ + clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Errata: A destination register from a send can not be + * used as a destination register until after it has been sourced by an + * instruction with a different destination register. + */ +void +fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst) +{ + int write_len = regs_written(inst); + unsigned first_write_grf = inst->dst.nr; + bool needs_dep[BRW_MAX_MRF(devinfo->ver)]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + /* Walk forwards looking for writes to registers we're writing which aren't + * read before being written. 
+ */ + foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) { + /* If we hit control flow, force resolve all remaining dependencies. */ + if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), + first_write_grf + i); + } + return; + } + + /* Clear the flag for registers that actually got read (as expected). */ + clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); + + /* We insert our reads as late as possible since they're reading the + * result of a SEND, which has massive latency. + */ + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr >= first_write_grf && + scan_inst->dst.nr < first_write_grf + write_len && + needs_dep[scan_inst->dst.nr - first_write_grf]) { + DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), + scan_inst->dst.nr); + needs_dep[scan_inst->dst.nr - first_write_grf] = false; + } + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } +} + +void +fs_visitor::insert_gfx4_send_dependency_workarounds() +{ + if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X) + return; + + bool progress = false; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->mlen != 0 && inst->dst.file == VGRF) { + insert_gfx4_pre_send_dependency_workarounds(block, inst); + insert_gfx4_post_send_dependency_workarounds(block, inst); + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); +} + +bool +fs_visitor::lower_load_payload() +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + assert(inst->dst.file == MRF || inst->dst.file == VGRF); + assert(inst->saturate == false); + fs_reg dst = inst->dst; + + /* Get rid of 
COMPR4. We'll add it back in if we need it */ + if (dst.file == MRF) + dst.nr = dst.nr & ~BRW_MRF_COMPR4; + + const fs_builder ibld(this, block, inst); + const fs_builder ubld = ibld.exec_all(); + + for (uint8_t i = 0; i < inst->header_size;) { + /* Number of header GRFs to initialize at once with a single MOV + * instruction. + */ + const unsigned n = + (i + 1 < inst->header_size && inst->src[i].stride == 1 && + inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ? + 2 : 1; + + if (inst->src[i].file != BAD_FILE) + ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD), + retype(inst->src[i], BRW_REGISTER_TYPE_UD)); + + dst = byte_offset(dst, n * REG_SIZE); + i += n; + } + + if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && + inst->exec_size > 8) { + /* In this case, the payload portion of the LOAD_PAYLOAD isn't + * a straightforward copy. Instead, the result of the + * LOAD_PAYLOAD is treated as interleaved and the first four + * non-header sources are unpacked as: + * + * m + 0: r0 + * m + 1: g0 + * m + 2: b0 + * m + 3: a0 + * m + 4: r1 + * m + 5: g1 + * m + 6: b1 + * m + 7: a1 + * + * This is used for gen <= 5 fb writes. + */ + assert(inst->exec_size == 16); + assert(inst->header_size + 4 <= inst->sources); + for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { + if (inst->src[i].file != BAD_FILE) { + if (devinfo->has_compr4) { + fs_reg compr4_dst = retype(dst, inst->src[i].type); + compr4_dst.nr |= BRW_MRF_COMPR4; + ibld.MOV(compr4_dst, inst->src[i]); + } else { + /* Platform doesn't have COMPR4. We have to fake it */ + fs_reg mov_dst = retype(dst, inst->src[i].type); + ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0)); + mov_dst.nr += 4; + ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1)); + } + } + + dst.nr++; + } + + /* The loop above only ever incremented us through the first set + * of 4 registers. 
However, thanks to the magic of COMPR4, we + * actually wrote to the first 8 registers, so we need to take + * that into account now. + */ + dst.nr += 4; + + /* The COMPR4 code took care of the first 4 sources. We'll let + * the regular path handle any remaining sources. Yes, we are + * modifying the instruction but we're about to delete it so + * this really doesn't hurt anything. + */ + inst->header_size += 4; + } + + for (uint8_t i = inst->header_size; i < inst->sources; i++) { + dst.type = inst->src[i].type; + if (inst->src[i].file != BAD_FILE) { + ibld.MOV(dst, inst->src[i]); + } + dst = offset(dst, ibld, 1); + } + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +/** + * Factor an unsigned 32-bit integer. + * + * Attempts to factor \c x into two values that are at most 0xFFFF. If no + * such factorization is possible, either because the value is too large or is + * prime, both \c result_a and \c result_b will be zero. + */ +static void +factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b) +{ + /* This is necessary to prevent various opportunities for division by zero + * below. + */ + assert(x > 0xffff); + + /* This represents the actual expected constraints on the input. Namely, + * both the upper and lower words should be > 1. + */ + assert(x >= 0x00020002); + + *result_a = 0; + *result_b = 0; + + /* The value is too large to factor with the constraints. */ + if (x > (0xffffu * 0xffffu)) + return; + + /* A non-prime number will have the form p*q*d where p is some prime + * number, q > 1, and 1 <= d <= q. To meet the constraints of this + * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p). + * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally, + * floor(x / (0xffff * p)) <= d <= floor(0xffff / p). + * + * The observation is finding the largest possible value of p reduces the + * possible range of d. 
After selecting p, all values of d in this range + * are tested until a factorization is found. The size of the range of + * possible values of d sets an upper bound on the run time of the + * function. + */ + static const uint16_t primes[256] = { + 2, 3, 5, 7, 11, 13, 17, 19, + 23, 29, 31, 37, 41, 43, 47, 53, + 59, 61, 67, 71, 73, 79, 83, 89, + 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */ + 137, 139, 149, 151, 157, 163, 167, 173, + 179, 181, 191, 193, 197, 199, 211, 223, + 227, 229, 233, 239, 241, 251, 257, 263, + 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */ + 313, 317, 331, 337, 347, 349, 353, 359, + 367, 373, 379, 383, 389, 397, 401, 409, + 419, 421, 431, 433, 439, 443, 449, 457, + 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */ + 509, 521, 523, 541, 547, 557, 563, 569, + 571, 577, 587, 593, 599, 601, 607, 613, + 617, 619, 631, 641, 643, 647, 653, 659, + 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */ + 727, 733, 739, 743, 751, 757, 761, 769, + 773, 787, 797, 809, 811, 821, 823, 827, + 829, 839, 853, 857, 859, 863, 877, 881, + 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */ + 947, 953, 967, 971, 977, 983, 991, 997, + 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, + 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, + 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */ + 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, + 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, + 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, + 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */ + 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459, + 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, + 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, + 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */ + }; + + unsigned p; + unsigned x_div_p; + + for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) { + p = primes[i]; + x_div_p = x / p; + + if ((x_div_p * p) == x) + break; + } + + /* A prime factor was not found. 
*/ + if (x_div_p * p != x) + return; + + /* Terminate early if d=1 is a solution. */ + if (x_div_p < 0x10000) { + *result_a = x_div_p; + *result_b = p; + return; + } + + /* Pick the maximum possible value for 'd'. It's important that the loop + * below execute while d <= max_d because max_d is a valid value. Having + * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be + * incorrectly reported as not being factorable. The problem would occur + * with any value that is a factor of two primes in the table and one prime + * not in the table. + */ + const unsigned max_d = 0xffff / p; + + /* Pick an initial value of 'd' that (combined with rejecting too large + * values above) guarantees that 'q' will always be small enough. + * DIV_ROUND_UP is used to prevent 'd' from being zero. + */ + for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) { + unsigned q = x_div_p / d; + + if ((q * d) == x_div_p) { + assert(p * d * q == x); + assert((p * d) < 0x10000); + + *result_a = q; + *result_b = p * d; + break; + } + + /* Since every value of 'd' is tried, as soon as 'd' is larger + * than 'q', we're just re-testing combinations that have + * already been tested. + */ + if (d > q) + break; + } +} + +void +fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) +{ + const fs_builder ibld(this, block, inst); + + /* It is correct to use inst->src[1].d in both end of the comparison. + * Using .ud in the UINT16_MAX comparison would cause any negative value to + * fail the check. + */ + if (inst->src[1].file == IMM && + (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) { + /* The MUL instruction isn't commutative. On Gen <= 6, only the low + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of + * src1 are used. + * + * If multiplying by an immediate value that fits in 16-bits, do a + * single MUL instruction with that value in the proper location. 
+ */ + const bool ud = (inst->src[1].d >= 0); + if (devinfo->ver < 7) { + fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); + ibld.MOV(imm, inst->src[1]); + ibld.MUL(inst->dst, imm, inst->src[0]); + } else { + ibld.MUL(inst->dst, inst->src[0], + ud ? brw_imm_uw(inst->src[1].ud) + : brw_imm_w(inst->src[1].d)); + } + } else { + /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot + * do 32-bit integer multiplication in one instruction, but instead + * must do a sequence (which actually calculates a 64-bit result): + * + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D + * mach(8) null g3<8,8,1>D g4<8,8,1>D + * mov(8) g2<1>D acc0<8,8,1>D + * + * But on Gen > 6, the ability to use second accumulator register + * (acc1) for non-float data types was removed, preventing a simple + * implementation in SIMD16. A 16-channel result can be calculated by + * executing the three instructions twice in SIMD8, once with quarter + * control of 1Q for the first eight channels and again with 2Q for + * the second eight channels. + * + * Which accumulator register is implicitly accessed (by AccWrEnable + * for instance) is determined by the quarter control. Unfortunately + * Ivybridge (and presumably Baytrail) has a hardware bug in which an + * implicit accumulator access by an instruction with 2Q will access + * acc1 regardless of whether the data type is usable in acc1. + * + * Specifically, the 2Q mach(8) writes acc1 which does not exist for + * integer data types. 
+ * + * Since we only want the low 32-bits of the result, we can do two + * 32-bit x 16-bit multiplies (like the mul and mach are doing), and + * adjust the high result and add them (like the mach is doing): + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW + * shl(8) g9<1>D g8<8,8,1>D 16D + * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D + * + * We avoid the shl instruction by realizing that we only want to add + * the low 16-bits of the "high" result to the high 16-bits of the + * "low" result and using proper regioning on the add: + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW + * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW + * + * Since it does not use the (single) accumulator register, we can + * schedule multi-component multiplications much better. + */ + + bool needs_mov = false; + fs_reg orig_dst = inst->dst; + + /* Get a new VGRF for the "low" 32x16-bit multiplication result if + * reusing the original destination is impossible due to hardware + * restrictions, source/destination overlap, or it being the null + * register. + */ + fs_reg low = inst->dst; + if (orig_dst.is_null() || orig_dst.file == MRF || + regions_overlap(inst->dst, inst->size_written, + inst->src[0], inst->size_read(0)) || + regions_overlap(inst->dst, inst->size_written, + inst->src[1], inst->size_read(1)) || + inst->dst.stride >= 4) { + needs_mov = true; + low = fs_reg(VGRF, alloc.allocate(regs_written(inst)), + inst->dst.type); + } + + /* Get a new VGRF but keep the same stride as inst->dst */ + fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type); + high.stride = inst->dst.stride; + high.offset = inst->dst.offset % REG_SIZE; + + bool do_addition = true; + if (devinfo->ver >= 7) { + /* From Wa_1604601757: + * + * "When multiplying a DW and any lower precision integer, source modifier + * is not supported." 
+ * + * An unsupported negate modifier on src[1] would ordinarily be + * lowered by the subsequent lower_regioning pass. In this case that + * pass would spawn another dword multiply. Instead, lower the + * modifier first. + */ + const bool source_mods_unsupported = (devinfo->ver >= 12); + + if (inst->src[1].abs || (inst->src[1].negate && + source_mods_unsupported)) + lower_src_modifiers(this, block, inst, 1); + + if (inst->src[1].file == IMM) { + unsigned a; + unsigned b; + + /* If the immeditate value can be factored into two values, A and + * B, that each fit in 16-bits, the multiplication result can + * instead be calculated as (src1 * (A * B)) = ((src1 * A) * B). + * This saves an operation (the addition) and a temporary register + * (high). + * + * Skip the optimization if either the high word or the low word + * is 0 or 1. In these conditions, at least one of the + * multiplications generated by the straightforward method will be + * eliminated anyway. + */ + if (inst->src[1].ud > 0x0001ffff && + (inst->src[1].ud & 0xffff) > 1) { + factor_uint32(inst->src[1].ud, &a, &b); + + if (a != 0) { + ibld.MUL(low, inst->src[0], brw_imm_uw(a)); + ibld.MUL(low, low, brw_imm_uw(b)); + do_addition = false; + } + } + + if (do_addition) { + ibld.MUL(low, inst->src[0], + brw_imm_uw(inst->src[1].ud & 0xffff)); + ibld.MUL(high, inst->src[0], + brw_imm_uw(inst->src[1].ud >> 16)); + } + } else { + ibld.MUL(low, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); + ibld.MUL(high, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1)); + } + } else { + if (inst->src[0].abs) + lower_src_modifiers(this, block, inst, 0); + + ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0), + inst->src[1]); + ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1), + inst->src[1]); + } + + if (do_addition) { + ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(high, BRW_REGISTER_TYPE_UW, 0)); + } + + 
if (needs_mov || inst->conditional_mod) + set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low)); + } +} + +void +fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block) +{ + const fs_builder ibld(this, block, inst); + + /* Considering two 64-bit integers ab and cd where each letter ab + * corresponds to 32 bits, we get a 128-bit result WXYZ. We * cd + * only need to provide the YZ part of the result. ------- + * BD + * Only BD needs to be 64 bits. For AD and BC we only care + AD + * about the lower 32 bits (since they are part of the upper + BC + * 32 bits of our result). AC is not needed since it starts + AC + * on the 65th bit of the result. ------- + * WXYZ + */ + unsigned int q_regs = regs_written(inst); + unsigned int d_regs = (q_regs + 1) / 2; + + fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ); + fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD); + fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD); + + /* Here we need the full 64 bit result for 32b * 32b. 
*/ + if (devinfo->has_integer_dword_mul) { + ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)); + } else { + fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD); + fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD); + const unsigned acc_width = reg_unit(devinfo) * 8; + fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD), + inst->group % acc_width); + + fs_inst *mul = ibld.MUL(acc, + subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); + mul->writes_accumulator = true; + + ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)); + ibld.MOV(bd_low, acc); + + ibld.UNDEF(bd); + ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low); + ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high); + } + + ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)); + ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), + subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1)); + + ibld.ADD(ad, ad, bc); + ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1), + subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad); + + if (devinfo->has_64bit_int) { + ibld.MOV(inst->dst, bd); + } else { + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), + subscript(bd, BRW_REGISTER_TYPE_UD, 0)); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), + subscript(bd, BRW_REGISTER_TYPE_UD, 1)); + } +} + +void +fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block) +{ + const fs_builder ibld(this, block, inst); + + /* According to the BDW+ BSpec page for the "Multiply Accumulate + * High" instruction: + * + * "An added preliminary mov is required for source modification on + * src1: + * mov (8) r3.0<1>:d -r3<8;8,1>:d + * mul (8) acc0:d 
r2.0<8;8,1>:d r3.0<16;8,2>:uw + * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d" + */ + if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs)) + lower_src_modifiers(this, block, inst, 1); + + /* Should have been lowered to 8-wide. */ + assert(inst->exec_size <= get_lowered_simd_width(this, inst)); + const unsigned acc_width = reg_unit(devinfo) * 8; + const fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), inst->dst.type), + inst->group % acc_width); + fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]); + fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]); + + if (devinfo->ver >= 8) { + /* Until Gfx8, integer multiplies read 32-bits from one source, + * and 16-bits from the other, and relying on the MACH instruction + * to generate the high bits of the result. + * + * On Gfx8, the multiply instruction does a full 32x32-bit + * multiply, but in order to do a 64-bit multiply we can simulate + * the previous behavior and then use a MACH instruction. + */ + assert(mul->src[1].type == BRW_REGISTER_TYPE_D || + mul->src[1].type == BRW_REGISTER_TYPE_UD); + mul->src[1].type = BRW_REGISTER_TYPE_UW; + mul->src[1].stride *= 2; + + if (mul->src[1].file == IMM) { + mul->src[1] = brw_imm_uw(mul->src[1].ud); + } + } else if (devinfo->verx10 == 70 && + inst->group > 0) { + /* Among other things the quarter control bits influence which + * accumulator register is used by the hardware for instructions + * that access the accumulator implicitly (e.g. MACH). A + * second-half instruction would normally map to acc1, which + * doesn't exist on Gfx7 and up (the hardware does emulate it for + * floating-point instructions *only* by taking advantage of the + * extra precision of acc0 not normally used for floating point + * arithmetic). 
+ * + * HSW and up are careful enough not to try to access an + * accumulator register that doesn't exist, but on earlier Gfx7 + * hardware we need to make sure that the quarter control bits are + * zero to avoid non-deterministic behaviour and emit an extra MOV + * to get the result masked correctly according to the current + * channel enables. + */ + mach->group = 0; + mach->force_writemask_all = true; + mach->dst = ibld.vgrf(inst->dst.type); + ibld.MOV(inst->dst, mach->dst); + } +} + +bool +fs_visitor::lower_integer_multiplication() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->opcode == BRW_OPCODE_MUL) { + /* If the instruction is already in a form that does not need lowering, + * return early. + */ + if (devinfo->ver >= 7) { + if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4) + continue; + } else { + if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4) + continue; + } + + if ((inst->dst.type == BRW_REGISTER_TYPE_Q || + inst->dst.type == BRW_REGISTER_TYPE_UQ) && + (inst->src[0].type == BRW_REGISTER_TYPE_Q || + inst->src[0].type == BRW_REGISTER_TYPE_UQ) && + (inst->src[1].type == BRW_REGISTER_TYPE_Q || + inst->src[1].type == BRW_REGISTER_TYPE_UQ)) { + lower_mul_qword_inst(inst, block); + inst->remove(block); + progress = true; + } else if (!inst->dst.is_accumulator() && + (inst->dst.type == BRW_REGISTER_TYPE_D || + inst->dst.type == BRW_REGISTER_TYPE_UD) && + (!devinfo->has_integer_dword_mul || + devinfo->verx10 >= 125)) { + lower_mul_dword_inst(inst, block); + inst->remove(block); + progress = true; + } + } else if (inst->opcode == SHADER_OPCODE_MULH) { + lower_mulh_inst(inst, block); + inst->remove(block); + progress = true; + } + + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +bool +fs_visitor::lower_minmax() +{ + assert(devinfo->ver < 6); + + bool progress = false; + + 
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == BRW_OPCODE_SEL && + inst->predicate == BRW_PREDICATE_NONE) { + /* If src1 is an immediate value that is not NaN, then it can't be + * NaN. In that case, emit CMP because it is much better for cmod + * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't + * support HF or DF, so it is not necessary to check for those. + */ + if (inst->src[1].type != BRW_REGISTER_TYPE_F || + (inst->src[1].file == IMM && !isnan(inst->src[1].f))) { + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + } else { + ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + } + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +bool +fs_visitor::lower_sub_sat() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == SHADER_OPCODE_USUB_SAT || + inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* The fundamental problem is the hardware performs source negation + * at the bit width of the source. If the source is 0x80000000D, the + * negation is 0x80000000D. As a result, subtractSaturate(0, + * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There + * are at least three ways to resolve this: + * + * 1. Use the accumulator for the negated source. The accumulator is + * 33 bits, so our source 0x80000000 is sign-extended to + * 0x1800000000. The negation of which is 0x080000000. This + * doesn't help for 64-bit integers (which are already bigger than + * 33 bits). There are also only 8 accumulators, so SIMD16 or + * SIMD32 instructions would have to be split into multiple SIMD8 + * instructions. + * + * 2. Use slightly different math. 
For any n-bit value x, we know (x + * >> 1) != -(x >> 1). We can use this fact to only do + * subtractions involving (x >> 1). subtractSaturate(a, b) == + * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)). + * + * 3. For unsigned sources, it is sufficient to replace the + * subtractSaturate with (a > b) ? a - b : 0. + * + * It may also be possible to use the SUBB instruction. This + * implicitly writes the accumulator, so it could only be used in the + * same situations as #1 above. It is further limited by only + * allowing UD sources. + */ + if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q && + inst->src[0].type != BRW_REGISTER_TYPE_UQ) { + fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type); + + ibld.MOV(acc, inst->src[1]); + fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]); + add->saturate = true; + add->src[0].negate = true; + } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* tmp = src1 >> 1; + * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp)); + */ + fs_reg tmp1 = ibld.vgrf(inst->src[0].type); + fs_reg tmp2 = ibld.vgrf(inst->src[0].type); + fs_reg tmp3 = ibld.vgrf(inst->src[0].type); + fs_inst *add; + + ibld.SHR(tmp1, inst->src[1], brw_imm_d(1)); + + add = ibld.ADD(tmp2, inst->src[1], tmp1); + add->src[1].negate = true; + + add = ibld.ADD(tmp3, inst->src[0], tmp1); + add->src[1].negate = true; + add->saturate = true; + + add = ibld.ADD(inst->dst, tmp3, tmp2); + add->src[1].negate = true; + add->saturate = true; + } else { + /* a > b ? 
a - b : 0 */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + BRW_CONDITIONAL_G); + + fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]); + add->src[1].negate = !add->src[1].negate; + + ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0)) + ->predicate = BRW_PREDICATE_NORMAL; + } + + inst->remove(block); + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Get the mask of SIMD channels enabled during dispatch and not yet disabled + * by discard. Due to the layout of the sample mask in the fragment shader + * thread payload, \p bld is required to have a dispatch_width() not greater + * than 16 for fragment shaders. + */ +fs_reg +brw_sample_mask_reg(const fs_builder &bld) +{ + const fs_visitor &s = *bld.shader; + + if (s.stage != MESA_SHADER_FRAGMENT) { + return brw_imm_ud(0xffffffff); + } else if (brw_wm_prog_data(s.stage_prog_data)->uses_kill) { + assert(bld.dispatch_width() <= 16); + return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16); + } else { + assert(s.devinfo->ver >= 6 && bld.dispatch_width() <= 16); + assert(s.devinfo->ver < 20); + return retype(brw_vec1_grf((bld.group() >= 16 ? 
2 : 1), 7), + BRW_REGISTER_TYPE_UW); + } +} + +uint32_t +brw_fb_write_msg_control(const fs_inst *inst, + const struct brw_wm_prog_data *prog_data) +{ + uint32_t mctl; + + if (inst->opcode == FS_OPCODE_REP_FB_WRITE) { + assert(inst->group == 0 && inst->exec_size == 16); + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; + } else if (prog_data->dual_src_blend) { + assert(inst->exec_size == 8); + + if (inst->group % 16 == 0) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; + else if (inst->group % 16 == 8) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; + else + unreachable("Invalid dual-source FB write instruction group"); + } else { + assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16)); + + if (inst->exec_size == 16) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + else if (inst->exec_size == 8) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; + else + unreachable("Invalid FB write execution size"); + } + + return mctl; +} + + /** + * Predicate the specified instruction on the sample mask. 
+ */ +void +brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst) +{ + assert(bld.shader->stage == MESA_SHADER_FRAGMENT && + bld.group() == inst->group && + bld.dispatch_width() == inst->exec_size); + + const fs_visitor &s = *bld.shader; + const fs_reg sample_mask = brw_sample_mask_reg(bld); + const unsigned subreg = sample_mask_flag_subreg(s); + + if (brw_wm_prog_data(s.stage_prog_data)->uses_kill) { + assert(sample_mask.file == ARF && + sample_mask.nr == brw_flag_subreg(subreg).nr && + sample_mask.subnr == brw_flag_subreg( + subreg + inst->group / 16).subnr); + } else { + bld.group(1, 0).exec_all() + .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask); + } + + if (inst->predicate) { + assert(inst->predicate == BRW_PREDICATE_NORMAL); + assert(!inst->predicate_inverse); + assert(inst->flag_subreg == 0); + assert(s.devinfo->ver < 20); + /* Combine the sample mask with the existing predicate by using a + * vertical predication mode. + */ + inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; + } else { + inst->flag_subreg = subreg; + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = false; + } +} + +static bool +is_mixed_float_with_fp32_dst(const fs_inst *inst) +{ + /* This opcode sometimes uses :W type on the source even if the operand is + * a :HF, because in gfx7 there is no support for :HF, and thus it uses :W. + */ + if (inst->opcode == BRW_OPCODE_F16TO32) + return true; + + if (inst->dst.type != BRW_REGISTER_TYPE_F) + return false; + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].type == BRW_REGISTER_TYPE_HF) + return true; + } + + return false; +} + +static bool +is_mixed_float_with_packed_fp16_dst(const fs_inst *inst) +{ + /* This opcode sometimes uses :W type on the destination even if the + * destination is a :HF, because in gfx7 there is no support for :HF, and + * thus it uses :W. 
+ */ + if (inst->opcode == BRW_OPCODE_F32TO16 && + inst->dst.stride == 1) + return true; + + if (inst->dst.type != BRW_REGISTER_TYPE_HF || + inst->dst.stride != 1) + return false; + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].type == BRW_REGISTER_TYPE_F) + return true; + } + + return false; +} + +/** + * Get the closest allowed SIMD width for instruction \p inst accounting for + * some common regioning and execution control restrictions that apply to FPU + * instructions. These restrictions don't necessarily have any relevance to + * instructions not executed by the FPU pipeline like extended math, control + * flow or send message instructions. + * + * For virtual opcodes it's really up to the instruction -- In some cases + * (e.g. where a virtual instruction unrolls into a simple sequence of FPU + * instructions) it may simplify virtual instruction lowering if we can + * enforce FPU-like regioning restrictions already on the virtual instruction, + * in other cases (e.g. virtual send-like instructions) this may be + * excessively restrictive. + */ +static unsigned +get_fpu_lowered_simd_width(const fs_visitor *shader, + const fs_inst *inst) +{ + const struct brw_compiler *compiler = shader->compiler; + const struct intel_device_info *devinfo = compiler->devinfo; + + /* Maximum execution size representable in the instruction controls. */ + unsigned max_width = MIN2(32, inst->exec_size); + + /* Number of channels per polygon handled by a multipolygon PS shader. */ + const unsigned poly_width = shader->dispatch_width / + MAX2(1, shader->max_polygons); + + /* Number of registers that will be read by an ATTR source if + * present for multipolygon PS shaders, since the PS vertex setup + * data for each polygon is stored in different contiguous GRFs. + */ + const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT || + shader->max_polygons < 2 ? 
0 : + DIV_ROUND_UP(inst->exec_size, + poly_width) * reg_unit(devinfo)); + + /* According to the PRMs: + * "A. In Direct Addressing mode, a source cannot span more than 2 + * adjacent GRF registers. + * B. A destination cannot span more than 2 adjacent GRF registers." + * + * Look for the source or destination with the largest register region + * which is the one that is going to limit the overall execution size of + * the instruction due to this rule. + */ + unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); + + for (unsigned i = 0; i < inst->sources; i++) + reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE), + (inst->src[i].file == ATTR ? attr_reg_count : 0)); + + /* Calculate the maximum execution size of the instruction based on the + * factor by which it goes over the hardware limit of 2 GRFs. + */ + const unsigned max_reg_count = 2 * reg_unit(devinfo); + if (reg_count > max_reg_count) + max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count)); + + /* According to the IVB PRMs: + * "When destination spans two registers, the source MUST span two + * registers. The exception to the above rule: + * + * - When source is scalar, the source registers are not incremented. + * - When source is packed integer Word and destination is packed + * integer DWord, the source register is not incremented but the + * source sub register is incremented." + * + * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning + * restrictions. The code below intentionally doesn't check whether the + * destination type is integer because empirically the hardware doesn't + * seem to care what the actual type is as long as it's dword-aligned. + * + * HSW PRMs also add a note to the second exception: + * "When lower 8 channels are disabled, the sub register of source1 + * operand is not incremented. 
If the lower 8 channels are expected + * to be disabled, say by predication, the instruction must be split + * into pair of simd8 operations." + * + * We can't reliably know if the channels won't be disabled due to, + * for example, IMASK. So, play it safe and disallow packed-word exception + * for src1. + */ + if (devinfo->ver < 8) { + for (unsigned i = 0; i < inst->sources; i++) { + /* IVB implements DF scalars as <0;2,1> regions. */ + const bool is_scalar_exception = is_uniform(inst->src[i]) && + (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8); + const bool is_packed_word_exception = i != 1 && + type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 && + type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1; + + /* We check size_read(i) against size_written instead of REG_SIZE + * because we want to properly handle SIMD32. In SIMD32, you can end + * up with writes to 4 registers and a source that reads 2 registers + * and we may still need to lower all the way to SIMD8 in that case. + */ + if (inst->size_written > REG_SIZE && + inst->size_read(i) != 0 && + inst->size_read(i) < inst->size_written && + !is_scalar_exception && !is_packed_word_exception) { + const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); + max_width = MIN2(max_width, inst->exec_size / reg_count); + } + } + } + + if (devinfo->ver < 6) { + /* From the G45 PRM, Volume 4 Page 361: + * + * "Operand Alignment Rule: With the exceptions listed below, a + * source/destination operand in general should be aligned to even + * 256-bit physical register with a region size equal to two 256-bit + * physical registers." + * + * Normally we enforce this by allocating virtual registers to the + * even-aligned class. But we need to handle payload registers. 
+ */ + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) && + inst->size_read(i) > REG_SIZE) { + max_width = MIN2(max_width, 8); + } + } + } + + /* From the IVB PRMs: + * "When an instruction is SIMD32, the low 16 bits of the execution mask + * are applied for both halves of the SIMD32 instruction. If different + * execution mask channels are required, split the instruction into two + * SIMD16 instructions." + * + * There is similar text in the HSW PRMs. Gfx4-6 don't even implement + * 32-wide control flow support in hardware and will behave similarly. + */ + if (devinfo->ver < 8 && !inst->force_writemask_all) + max_width = MIN2(max_width, 16); + + /* From the IVB PRMs (applies to HSW too): + * "Instructions with condition modifiers must not use SIMD32." + * + * From the BDW PRMs (applies to later hardware too): + * "Ternary instruction with condition modifiers must not use SIMD32." + */ + if (inst->conditional_mod && (devinfo->ver < 8 || + (inst->is_3src(compiler) && devinfo->ver < 12))) + max_width = MIN2(max_width, 16); + + /* From the IVB PRMs (applies to other devices that don't have the + * intel_device_info::supports_simd16_3src flag set): + * "In Align16 access mode, SIMD16 is not allowed for DW operations and + * SIMD8 is not allowed for DF operations." 
+ */ + if (inst->is_3src(compiler) && !devinfo->supports_simd16_3src) + max_width = MIN2(max_width, inst->exec_size / reg_count); + + /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is + * the 8-bit quarter of the execution mask signals specified in the + * instruction control fields) for the second compressed half of any + * single-precision instruction (for double-precision instructions + * it's hardwired to use NibCtrl+1, at least on HSW), which means that + * the EU will apply the wrong execution controls for the second + * sequential GRF write if the number of channels per GRF is not exactly + * eight in single-precision mode (or four in double-float mode). + * + * In this situation we calculate the maximum size of the split + * instructions so they only ever write to a single register. + */ + if (devinfo->ver < 8 && inst->size_written > REG_SIZE && + !inst->force_writemask_all) { + const unsigned channels_per_grf = inst->exec_size / + DIV_ROUND_UP(inst->size_written, REG_SIZE); + const unsigned exec_type_size = get_exec_type_size(inst); + assert(exec_type_size); + + /* The hardware shifts exactly 8 channels per compressed half of the + * instruction in single-precision mode and exactly 4 in double-precision. + */ + if (channels_per_grf != (exec_type_size == 8 ? 4 : 8)) + max_width = MIN2(max_width, channels_per_grf); + + /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT + * because HW applies the same channel enable signals to both halves of + * the compressed instruction which will be just wrong under + * non-uniform control flow. + */ + if (devinfo->verx10 == 70 && + (exec_type_size == 8 || type_sz(inst->dst.type) == 8)) + max_width = MIN2(max_width, 4); + } + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No SIMD16 in mixed mode when destination is f32. Instruction + * execution size must be no more than 8." 
+ * + * FIXME: the simulator doesn't seem to complain if we don't do this and + * empirical testing with existing CTS tests show that they pass just fine + * without implementing this, however, since our interpretation of the PRM + * is that conversion MOVs between HF and F are still mixed-float + * instructions (and therefore subject to this restriction) we decided to + * split them to be safe. Might be useful to do additional investigation to + * lift the restriction if we can ensure that it is safe though, since these + * conversions are common when half-float types are involved since many + * instructions do not support HF types and conversions from/to F are + * required. + */ + if (is_mixed_float_with_fp32_dst(inst) && devinfo->ver < 20) + max_width = MIN2(max_width, 8); + + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode + * Float Operations: + * + * "No SIMD16 in mixed mode when destination is packed f16 for both + * Align1 and Align16." + */ + if (is_mixed_float_with_packed_fp16_dst(inst) && devinfo->ver < 20) + max_width = MIN2(max_width, 8); + + /* Only power-of-two execution sizes are representable in the instruction + * control fields. + */ + return 1 << util_logbase2(max_width); +} + +/** + * Get the maximum allowed SIMD width for instruction \p inst accounting for + * various payload size restrictions that apply to sampler message + * instructions. + * + * This is only intended to provide a maximum theoretical bound for the + * execution size of the message based on the number of argument components + * alone, which in most cases will determine whether the SIMD8 or SIMD16 + * variant of the message can be used, though some messages may have + * additional restrictions not accounted for here (e.g. pre-ILK hardware uses + * the message length to determine the exact SIMD width and argument count, + * which makes a number of sampler message combinations impossible to + * represent). 
+ * + * Note: Platforms with monolithic SIMD16 double the possible SIMD widths + * change from (SIMD8, SIMD16) to (SIMD16, SIMD32). + */ +static unsigned +get_sampler_lowered_simd_width(const struct intel_device_info *devinfo, + const fs_inst *inst) +{ + /* If we have a min_lod parameter on anything other than a simple sample + * message, it will push it over 5 arguments and we have to fall back to + * SIMD8. + */ + if (inst->opcode != SHADER_OPCODE_TEX && + inst->components_read(TEX_LOGICAL_SRC_MIN_LOD)) + return devinfo->ver < 20 ? 8 : 16; + + /* Calculate the number of coordinate components that have to be present + * assuming that additional arguments follow the texel coordinates in the + * message payload. On IVB+ there is no need for padding, on ILK-SNB we + * need to pad to four or three components depending on the message, + * pre-ILK we need to pad to at most three components. + */ + const unsigned req_coord_components = + (devinfo->ver >= 7 || + !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 : + (devinfo->ver >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL && + inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 : + 3; + + /* On Gfx9+ the LOD argument is for free if we're able to use the LZ + * variant of the TXL or TXF message. + */ + const bool implicit_lod = devinfo->ver >= 9 && + (inst->opcode == SHADER_OPCODE_TXL || + inst->opcode == SHADER_OPCODE_TXF) && + inst->src[TEX_LOGICAL_SRC_LOD].is_zero(); + + /* Calculate the total number of argument components that need to be passed + * to the sampler unit. + */ + const unsigned num_payload_components = + MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE), + req_coord_components) + + inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) + + (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) + + inst->components_read(TEX_LOGICAL_SRC_LOD2) + + inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) + + (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ? 
+ inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) + + inst->components_read(TEX_LOGICAL_SRC_MCS); + + const unsigned simd_limit = reg_unit(devinfo) * + (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16); + + /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the + * maximum message size supported by the sampler, regardless of whether a + * header is provided or not. + */ + return MIN2(inst->exec_size, simd_limit); +} + +/** + * Get the closest native SIMD width supported by the hardware for instruction + * \p inst. The instruction will be left untouched by + * fs_visitor::lower_simd_width() if the returned value is equal to the + * original execution size. + */ +static unsigned +get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) +{ + const struct brw_compiler *compiler = shader->compiler; + const struct intel_device_info *devinfo = compiler->devinfo; + + switch (inst->opcode) { + case BRW_OPCODE_DP4A: + case BRW_OPCODE_MOV: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_ASR: + case BRW_OPCODE_ROR: + case BRW_OPCODE_ROL: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_CSEL: + case BRW_OPCODE_F32TO16: + case BRW_OPCODE_F16TO32: + case BRW_OPCODE_BFREV: + case BRW_OPCODE_BFE: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + case BRW_OPCODE_AVG: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_LZD: + case BRW_OPCODE_FBH: + case BRW_OPCODE_FBL: + case BRW_OPCODE_CBIT: + case BRW_OPCODE_SAD2: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case BRW_OPCODE_ADD3: + case FS_OPCODE_PACK: + case SHADER_OPCODE_SEL_EXEC: + case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_MOV_RELOC_IMM: + return get_fpu_lowered_simd_width(shader, inst); + + case BRW_OPCODE_CMP: { + /* The Ivybridge/BayTrail 
WaCMPInstFlagDepClearedEarly workaround says that + * when the destination is a GRF the dependency-clear bit on the flag + * register is cleared early. + * + * Suggested workarounds are to disable coissuing CMP instructions + * or to split CMP(16) instructions into two CMP(8) instructions. + * + * We choose to split into CMP(8) instructions since disabling + * coissuing would affect CMP instructions not otherwise affected by + * the errata. + */ + const unsigned max_width = (devinfo->verx10 == 70 && + !inst->dst.is_null() ? 8 : ~0); + return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst)); + } + case BRW_OPCODE_BFI1: + case BRW_OPCODE_BFI2: + /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we + * should + * "Force BFI instructions to be executed always in SIMD8." + */ + return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u, + get_fpu_lowered_simd_width(shader, inst)); + + case BRW_OPCODE_IF: + assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16); + return inst->exec_size; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: { + /* Unary extended math instructions are limited to SIMD8 on Gfx4 and + * Gfx6. Extended Math Function is limited to SIMD8 with half-float. + */ + if (devinfo->ver == 6 || devinfo->verx10 == 40) + return MIN2(8, inst->exec_size); + if (inst->dst.type == BRW_REGISTER_TYPE_HF) + return MIN2(8, inst->exec_size); + return MIN2(16, inst->exec_size); + } + + case SHADER_OPCODE_POW: { + /* SIMD16 is only allowed on Gfx7+. 
Extended Math Function is limited + * to SIMD8 with half-float + */ + if (devinfo->ver < 7) + return MIN2(8, inst->exec_size); + if (inst->dst.type == BRW_REGISTER_TYPE_HF) + return MIN2(8, inst->exec_size); + return MIN2(16, inst->exec_size); + } + + case SHADER_OPCODE_USUB_SAT: + case SHADER_OPCODE_ISUB_SAT: + return get_fpu_lowered_simd_width(shader, inst); + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + /* Integer division is limited to SIMD8 on all generations. */ + return MIN2(8, inst->exec_size); + + case FS_OPCODE_LINTERP: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + return MIN2(16, inst->exec_size); + + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch + * message used to implement varying pull constant loads, so expand it + * to SIMD16. An alternative with longer message payload length but + * shorter return payload would be to use the SIMD8 sampler message that + * takes (header, u, v, r) as parameters instead of (header, u). + */ + return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size)); + + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + case FS_OPCODE_DDY_COARSE: + case FS_OPCODE_DDY_FINE: + /* The implementation of this virtual opcode may require emitting + * compressed Align16 instructions, which are severely limited on some + * generations. + * + * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register + * Region Restrictions): + * + * "In Align16 access mode, SIMD16 is not allowed for DW operations + * and SIMD8 is not allowed for DF operations." + * + * In this context, "DW operations" means "operations acting on 32-bit + * values", so it includes operations on floats. + * + * Gfx4 has a similar restriction. 
From the i965 PRM, section 11.5.3 + * (Instruction Compression -> Rules and Restrictions): + * + * "A compressed instruction must be in Align1 access mode. Align16 + * mode instructions cannot be compressed." + * + * Similar text exists in the g45 PRM. + * + * Empirically, compressed align16 instructions using odd register + * numbers don't appear to work on Sandybridge either. + */ + return (devinfo->ver == 4 || devinfo->ver == 6 || + (devinfo->verx10 == 70) ? + MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size)); + + case SHADER_OPCODE_MULH: + /* MULH is lowered to the MUL/MACH sequence using the accumulator, which + * is 8-wide on Gfx7+. + */ + return (devinfo->ver >= 20 ? 16 : + devinfo->ver >= 7 ? 8 : + get_fpu_lowered_simd_width(shader, inst)); + + case FS_OPCODE_FB_WRITE_LOGICAL: + /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them + * here. + */ + assert(devinfo->ver != 6 || + inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE || + inst->exec_size == 8); + /* Dual-source FB writes are unsupported in SIMD16 mode. */ + return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ? + 8 : MIN2(16, inst->exec_size)); + + case FS_OPCODE_FB_READ_LOGICAL: + return MIN2(16, inst->exec_size); + + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + return get_sampler_lowered_simd_width(devinfo, inst); + + /* On gfx12 parameters are fixed to 16-bit values and therefore they all + * always fit regardless of the execution size. + */ + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + return MIN2(16, inst->exec_size); + + case SHADER_OPCODE_TXD_LOGICAL: + /* TXD is unsupported in SIMD16 mode previous to Xe2. SIMD32 is still + * unsuppported on Xe2. 
+ */ + return devinfo->ver < 20 ? 8 : 16; + + case SHADER_OPCODE_TXL_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + /* Only one execution size is representable pre-ILK depending on whether + * the shadow reference argument is present. + */ + if (devinfo->ver == 4) + return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8; + else + return get_sampler_lowered_simd_width(devinfo, inst); + + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD + * messages. Use SIMD16 instead. + */ + if (devinfo->ver == 4) + return 16; + else + return get_sampler_lowered_simd_width(devinfo, inst); + + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + return 8; + + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: + return MIN2(16, inst->exec_size); + + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: + return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size); + + case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: + assert(inst->exec_size <= 16); + return inst->exec_size; + + case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: + return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8; + + case SHADER_OPCODE_URB_READ_LOGICAL: + case SHADER_OPCODE_URB_WRITE_LOGICAL: + return MIN2(devinfo->ver < 20 ? 
8 : 16, inst->exec_size); + + case SHADER_OPCODE_QUAD_SWIZZLE: { + const unsigned swiz = inst->src[1].ud; + return (is_uniform(inst->src[0]) ? + get_fpu_lowered_simd_width(shader, inst) : + devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 : + swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 : + get_fpu_lowered_simd_width(shader, inst)); + } + case SHADER_OPCODE_MOV_INDIRECT: { + /* From IVB and HSW PRMs: + * + * "2.When the destination requires two registers and the sources are + * indirect, the sources must use 1x1 regioning mode. + * + * In case of DF instructions in HSW/IVB, the exec_size is limited by + * the EU decompression logic not handling VxH indirect addressing + * correctly. + */ + const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE; + /* Prior to Broadwell, we only have 8 address subregisters. */ + return MIN3(devinfo->ver >= 8 ? 16 : 8, + max_size / (inst->dst.stride * type_sz(inst->dst.type)), + inst->exec_size); + } + + case SHADER_OPCODE_LOAD_PAYLOAD: { + const unsigned reg_count = + DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE); + + if (reg_count > 2) { + /* Only LOAD_PAYLOAD instructions with per-channel destination region + * can be easily lowered (which excludes headers and heterogeneous + * types). + */ + assert(!inst->header_size); + for (unsigned i = 0; i < inst->sources; i++) + assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) || + inst->src[i].file == BAD_FILE); + + return inst->exec_size / DIV_ROUND_UP(reg_count, 2); + } else { + return inst->exec_size; + } + } + default: + return inst->exec_size; + } +} + +/** + * Return true if splitting out the group of channels of instruction \p inst + * given by lbld.group() requires allocating a temporary for the i-th source + * of the lowered instruction. 
+ */ +static inline bool +needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i) +{ + return !(is_periodic(inst->src[i], lbld.dispatch_width()) || + (inst->components_read(i) == 1 && + lbld.dispatch_width() <= inst->exec_size)) || + (inst->flags_written(lbld.shader->devinfo) & + flag_mask(inst->src[i], type_sz(inst->src[i].type))); +} + +/** + * Extract the data that would be consumed by the channel group given by + * lbld.group() from the i-th source region of instruction \p inst and return + * it as result in packed form. + */ +static fs_reg +emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i) +{ + assert(lbld.group() >= inst->group); + + /* Specified channel group from the source region. */ + const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group); + + if (needs_src_copy(lbld, inst, i)) { + /* Builder of the right width to perform the copy avoiding uninitialized + * data if the lowered execution size is greater than the original + * execution size of the instruction. + */ + const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(), + inst->exec_size), 0); + const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i)); + + for (unsigned k = 0; k < inst->components_read(i); ++k) + cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k)); + + return tmp; + + } else if (is_periodic(inst->src[i], lbld.dispatch_width())) { + /* The source is invariant for all dispatch_width-wide groups of the + * original region. + */ + return inst->src[i]; + + } else { + /* We can just point the lowered instruction at the right channel group + * from the original region. + */ + return src; + } +} + +/** + * Return true if splitting out the group of channels of instruction \p inst + * given by lbld.group() requires allocating a temporary for the destination + * of the lowered instruction and copying the data back to the original + * destination region. 
+ */ +static inline bool +needs_dst_copy(const fs_builder &lbld, const fs_inst *inst) +{ + if (inst->dst.is_null()) + return false; + + /* If the instruction writes more than one component we'll have to shuffle + * the results of multiple lowered instructions in order to make sure that + * they end up arranged correctly in the original destination region. + */ + if (inst->size_written > inst->dst.component_size(inst->exec_size)) + return true; + + /* If the lowered execution size is larger than the original the result of + * the instruction won't fit in the original destination, so we'll have to + * allocate a temporary in any case. + */ + if (lbld.dispatch_width() > inst->exec_size) + return true; + + for (unsigned i = 0; i < inst->sources; i++) { + /* If we already made a copy of the source for other reasons there won't + * be any overlap with the destination. + */ + if (needs_src_copy(lbld, inst, i)) + continue; + + /* In order to keep the logic simple we emit a copy whenever the + * destination region doesn't exactly match an overlapping source, which + * may point at the source and destination not being aligned group by + * group which could cause one of the lowered instructions to overwrite + * the data read from the same source by other lowered instructions. + */ + if (regions_overlap(inst->dst, inst->size_written, + inst->src[i], inst->size_read(i)) && + !inst->dst.equals(inst->src[i])) + return true; + } + + return false; +} + +/** + * Insert data from a packed temporary into the channel group given by + * lbld.group() of the destination region of instruction \p inst and return + * the temporary as result. Any copy instructions that are required for + * unzipping the previous value (in the case of partial writes) will be + * inserted using \p lbld_before and any copy instructions required for + * zipping up the destination of \p inst will be inserted using \p lbld_after. 
 */
static fs_reg
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
         fs_inst *inst)
{
   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
   assert(lbld_before.group() == lbld_after.group());
   assert(lbld_after.group() >= inst->group);

   const struct intel_device_info *devinfo = lbld_before.shader->devinfo;

   /* Specified channel group from the destination region. */
   const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);

   if (!needs_dst_copy(lbld_after, inst)) {
      /* No need to allocate a temporary for the lowered instruction, just
       * take the right group of channels from the original region.
       */
      return dst;
   }

   /* Deal with the residency data part later */
   const unsigned residency_size = inst->has_sampler_residency() ?
      (reg_unit(devinfo) * REG_SIZE) : 0;
   /* Number of whole destination components, excluding any trailing
    * residency data appended by the sampler message.
    */
   const unsigned dst_size = (inst->size_written - residency_size) /
      inst->dst.component_size(inst->exec_size);

   const fs_reg tmp = lbld_after.vgrf(inst->dst.type,
                                      dst_size + inst->has_sampler_residency());

   if (inst->predicate) {
      /* Handle predication by copying the original contents of the
       * destination into the temporary before emitting the lowered
       * instruction.
       */
      const fs_builder gbld_before =
         lbld_before.group(MIN2(lbld_before.dispatch_width(),
                                inst->exec_size), 0);
      for (unsigned k = 0; k < dst_size; ++k) {
         gbld_before.MOV(offset(tmp, lbld_before, k),
                         offset(dst, inst->exec_size, k));
      }
   }

   const fs_builder gbld_after =
      lbld_after.group(MIN2(lbld_after.dispatch_width(),
                            inst->exec_size), 0);
   for (unsigned k = 0; k < dst_size; ++k) {
      /* Use a builder of the right width to perform the copy avoiding
       * uninitialized data if the lowered execution size is greater than the
       * original execution size of the instruction.
       */
      gbld_after.MOV(offset(dst, inst->exec_size, k),
                     offset(tmp, lbld_after, k));
   }

   if (inst->has_sampler_residency()) {
      /* Sampler messages with residency need a special attention. In the
       * first lane of the last component are located the Pixel Null Mask
       * (bits 0:15) & some upper bits we need to discard (bits 16:31). We
       * have to build a single 32bit value for the SIMD32 message out of 2
       * SIMD16 16 bit values.
       */
      const fs_builder rbld = gbld_after.exec_all().group(1, 0);
      fs_reg local_res_reg = component(
         retype(offset(tmp, lbld_before, dst_size),
                BRW_REGISTER_TYPE_UW), 0);
      fs_reg final_res_reg =
         retype(byte_offset(inst->dst,
                            inst->size_written - residency_size +
                            gbld_after.group() / 8),
                BRW_REGISTER_TYPE_UW);
      rbld.MOV(final_res_reg, local_res_reg);
   }

   return tmp;
}

/**
 * Split any instruction whose execution size is wider than
 * get_lowered_simd_width() reports the hardware can handle into several
 * narrower copies, using emit_unzip()/emit_zip() to shuffle the source and
 * destination regions of each group into place.
 */
bool
fs_visitor::lower_simd_width()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const unsigned lower_width = get_lowered_simd_width(this, inst);

      if (lower_width != inst->exec_size) {
         /* Builder matching the original instruction.  We may also need to
          * emit an instruction of width larger than the original, set the
          * execution size of the builder to the highest of both for now so
          * we're sure that both cases can be handled.
          */
         const unsigned max_width = MAX2(inst->exec_size, lower_width);

         const fs_builder bld = fs_builder(this).at_end();
         const fs_builder ibld = bld.at(block, inst)
                                    .exec_all(inst->force_writemask_all)
                                    .group(max_width, inst->group / max_width);

         /* Split the copies in chunks of the execution width of either the
          * original or the lowered instruction, whichever is lower.
          */
         const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
         const unsigned residency_size = inst->has_sampler_residency() ?
            (reg_unit(devinfo) * REG_SIZE) : 0;
         const unsigned dst_size =
            (inst->size_written - residency_size) /
            inst->dst.component_size(inst->exec_size);

         assert(!inst->writes_accumulator && !inst->mlen);

         /* Inserting the zip, unzip, and duplicated instructions in all of
          * the right spots is somewhat tricky.  All of the unzip and any
          * instructions from the zip which unzip the destination prior to
          * writing need to happen before all of the per-group instructions
          * and the zip instructions need to happen after.  In order to sort
          * this all out, we insert the unzip instructions before \p inst,
          * insert the per-group instructions after \p inst (i.e. before
          * inst->next), and insert the zip instructions before the
          * instruction after \p inst.  Since we are inserting instructions
          * after \p inst, inst->next is a moving target and we need to save
          * it off here so that we insert the zip instructions in the right
          * place.
          *
          * Since we're inserting split instructions after after_inst, the
          * instructions will end up in the reverse order that we insert them.
          * However, certain render target writes require that the low group
          * instructions come before the high group.  From the Ivy Bridge PRM
          * Vol. 4, Pt. 1, Section 3.9.11:
          *
          *    "If multiple SIMD8 Dual Source messages are delivered by the
          *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
          *    issued before the SIMD8_DUALSRC_HI message with the same Slot
          *    Group Select setting."
          *
          * And, from Section 3.9.11.1 of the same PRM:
          *
          *    "When SIMD32 or SIMD16 PS threads send render target writes
          *    with multiple SIMD8 and SIMD16 messages, the following must
          *    hold:
          *
          *    All the slots (as described above) must have a corresponding
          *    render target write irrespective of the slot's validity.  A slot
          *    is considered valid when at least one sample is enabled.  For
          *    example, a SIMD16 PS thread must send two SIMD8 render target
          *    writes to cover all the slots.
          *
          *    PS thread must send SIMD render target write messages with
          *    increasing slot numbers.  For example, SIMD16 thread has
          *    Slot[15:0] and if two SIMD8 render target writes are used, the
          *    first SIMD8 render target write must send Slot[7:0] and the
          *    next one must send Slot[15:8]."
          *
          * In order to make low group instructions come before high group
          * instructions (this is required for some render target writes), we
          * split from the highest group to lowest.
          */
         exec_node *const after_inst = inst->next;
         for (int i = n - 1; i >= 0; i--) {
            /* Emit a copy of the original instruction with the lowered width.
             * If the EOT flag was set throw it away except for the last
             * instruction to avoid killing the thread prematurely.
             */
            fs_inst split_inst = *inst;
            split_inst.exec_size = lower_width;
            split_inst.eot = inst->eot && i == int(n - 1);

            /* Select the correct channel enables for the i-th group, then
             * transform the sources and destination and emit the lowered
             * instruction.
             */
            const fs_builder lbld = ibld.group(lower_width, i);

            for (unsigned j = 0; j < inst->sources; j++)
               split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);

            split_inst.dst = emit_zip(lbld.at(block, inst),
                                      lbld.at(block, after_inst), inst);
            split_inst.size_written =
               split_inst.dst.component_size(lower_width) * dst_size +
               residency_size;

            lbld.at(block, inst->next).emit(split_inst);
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
fs_visitor::lower_barycentrics()
{
   const bool has_interleaved_layout = devinfo->has_pln ||
      (devinfo->ver >= 7 && devinfo->ver < 20);
   bool progress = false;

   /* Only fragment shaders on platforms with the interleaved layout need
    * this transformation.
    */
   if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(this, block, inst);
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case FS_OPCODE_LINTERP : {
         /* Interleave the barycentric source of LINTERP into a temporary
          * payload before the instruction consumes it.
          */
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
         fs_reg srcs[4];

         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[0] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         /* The PI shared function returns the interleaved layout; de-interleave
          * its result into the original destination with per-group MOVs that
          * inherit the instruction's predication.
          */
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
                          DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower a derivative instruction as the floating-point difference of two
 * swizzles of the source, specified as \p swz0 and \p swz1.
 */
static bool
lower_derivative(fs_visitor *v, bblock_t *block, fs_inst *inst,
                 unsigned swz0, unsigned swz1)
{
   const fs_builder ubld = fs_builder(v, block, inst).exec_all();
   const fs_reg tmp0 = ubld.vgrf(inst->src[0].type);
   const fs_reg tmp1 = ubld.vgrf(inst->src[0].type);

   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));

   /* Rewrite the original instruction in place as tmp1 - tmp0. */
   inst->resize_sources(2);
   inst->src[0] = negate(tmp0);
   inst->src[1] = tmp1;
   inst->opcode = BRW_OPCODE_ADD;

   return true;
}

/**
 * Lower derivative instructions on platforms where codegen cannot implement
 * them efficiently (i.e. XeHP).
 */
bool
fs_visitor::lower_derivatives()
{
   bool progress = false;

   if (devinfo->verx10 < 125)
      return false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == FS_OPCODE_DDX_COARSE)
         progress |= lower_derivative(this, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);

      else if (inst->opcode == FS_OPCODE_DDX_FINE)
         progress |= lower_derivative(this, block, inst,
                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);

      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
         progress |= lower_derivative(this, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);

      else if (inst->opcode == FS_OPCODE_DDY_FINE)
         progress |= lower_derivative(this, block, inst,
                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower FIND_LIVE_CHANNEL/FIND_LAST_LIVE_CHANNEL pseudo-instructions into
 * arithmetic on the hardware execution mask (Gfx8+ only).
 */
bool
fs_visitor::lower_find_live_channel()
{
   bool progress = false;

   if (devinfo->ver < 8)
      return false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(devinfo, stage, max_polygons,
                                    stage_prog_data);
   bool vmask =
      stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(stage_prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */
      fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

      const fs_builder ibld(this, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const fs_builder ubld = fs_builder(this, block, inst).exec_all().group(1, 0);

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      if (first) {
         ubld.FBL(inst->dst, exec_mask);
      } else {
         /* Last live channel: 31 - lzd(mask) gives the index of the highest
          * set bit.
          */
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Dump every instruction to \p file, annotated with per-IP register pressure
 * and control-flow indentation when a CFG is available.
 */
void
fs_visitor::dump_instructions_to_file(FILE *file) const
{
   if (cfg) {
      const register_pressure &rp = regpressure_analysis.require();
      unsigned ip = 0, max_pressure = 0;
      unsigned cf_count = 0;
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
         if (inst->is_control_flow_end())
            cf_count -= 1;

         max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
         fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
         for (unsigned i = 0; i < cf_count; i++)
            fprintf(file, "  ");
         dump_instruction(inst, file);
         ip++;

         if (inst->is_control_flow_begin())
            cf_count += 1;
      }
      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   } else {
      int ip = 0;
      foreach_in_list(backend_instruction, inst, &instructions) {
         fprintf(file, "%4d: ", ip++);
         dump_instruction(inst, file);
      }
   }
}

/**
 * Print a single instruction to \p file in human-readable form: predicate,
 * opcode and modifiers, destination, then each source with region/type.
 */
void
fs_visitor::dump_instruction_to_file(const backend_instruction *be_inst, FILE *file) const
{
   const fs_inst *inst = (const fs_inst *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf%d.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg / 2,
              inst->flag_subreg % 2);
   }

   fprintf(file, "%s", brw_instruction_name(&compiler->isa, inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_CSEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
                 inst->flag_subreg % 2);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);

   if (inst->mlen) {
      fprintf(file, "(mlen: %d) ", inst->mlen);
   }

   if (inst->ex_mlen) {
      fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
   }

   if (inst->eot) {
      fprintf(file, "(EOT) ");
   }

   switch (inst->dst.file) {
   case VGRF:
      fprintf(file, "vgrf%d", inst->dst.nr);
      break;
   case FIXED_GRF:
      fprintf(file, "g%d", inst->dst.nr);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.nr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      fprintf(file, "***u%d***", inst->dst.nr);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.nr);
      break;
   case ARF:
      switch (inst->dst.nr) {
      case BRW_ARF_NULL:
         fprintf(file, "null");
         break;
      case BRW_ARF_ADDRESS:
         fprintf(file, "a0.%d", inst->dst.subnr);
         break;
      case BRW_ARF_ACCUMULATOR:
         fprintf(file, "acc%d", inst->dst.subnr);
         break;
      case BRW_ARF_FLAG:
         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      default:
         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      }
      break;
   case IMM:
      unreachable("not reached");
   }

   if (inst->dst.offset ||
       (inst->dst.file == VGRF &&
        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
      const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
              inst->dst.offset % reg_size);
   }

   if (inst->dst.stride != 1)
      fprintf(file, "<%u>", inst->dst.stride);
   fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case VGRF:
         fprintf(file, "vgrf%d", inst->src[i].nr);
         break;
      case FIXED_GRF:
         fprintf(file, "g%d", inst->src[i].nr);
         break;
      case MRF:
         fprintf(file, "***m%d***", inst->src[i].nr);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].nr);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].nr);
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_HF:
            fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
            break;
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%-gf", inst->src[i].f);
            break;
         case BRW_REGISTER_TYPE_DF:
            fprintf(file, "%fdf", inst->src[i].df);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].ud);
            break;
         case BRW_REGISTER_TYPE_Q:
            fprintf(file, "%" PRId64 "q", inst->src[i].d64);
            break;
         case BRW_REGISTER_TYPE_UQ:
            fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
            break;
         case BRW_REGISTER_TYPE_V:
         case BRW_REGISTER_TYPE_UV:
            fprintf(file, "%08x%s", inst->src[i].ud,
                    inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case ARF:
         switch (inst->src[i].nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->src[i].subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->src[i].subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         }
         break;
      }

      if (inst->src[i].offset ||
          (inst->src[i].file == VGRF &&
           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
                 inst->src[i].offset % reg_size);
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         unsigned stride;
         if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
            unsigned hstride = inst->src[i].hstride;
            stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
         } else {
            stride = inst->src[i].stride;
         }
         if (stride != 1)
            fprintf(file, "<%u>", stride);

         fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   if (inst->force_writemask_all)
      fprintf(file, "NoMask ");

   if (inst->exec_size != dispatch_width)
      fprintf(file, "group%d ", inst->group);

   fprintf(file, "\n");
}

/**
 * Compute the number of GRF registers live at each instruction IP, summing
 * the sizes of live VGRFs and counting payload registers up to their last
 * use.
 */
brw::register_pressure::register_pressure(const fs_visitor *v)
{
   const fs_live_variables &live = v->live_analysis.require();
   const unsigned num_instructions = v->cfg->num_blocks ?
      v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;

   regs_live_at_ip = new unsigned[num_instructions]();

   for (unsigned reg = 0; reg < v->alloc.count; reg++) {
      for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
         regs_live_at_ip[ip] += v->alloc.sizes[reg];
   }

   const unsigned payload_count = v->first_non_payload_grf;

   int *payload_last_use_ip = new int[payload_count];
   v->calculate_payload_ranges(payload_count, payload_last_use_ip);

   for (unsigned reg = 0; reg < payload_count; reg++) {
      for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
         ++regs_live_at_ip[ip];
   }

   delete[] payload_last_use_ip;
}

brw::register_pressure::~register_pressure()
{
   delete[] regs_live_at_ip;
}

/**
 * Propagate analysis invalidation to the fs-specific analyses in addition to
 * whatever the base class tracks.
 */
void
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
{
   backend_shader::invalidate_analysis(c);
   live_analysis.invalidate(c);
   regpressure_analysis.invalidate(c);
}

/**
 * Dump the current instruction list to a per-pass file under
 * INTEL_SHADER_OPTIMIZER_PATH when DEBUG_OPTIMIZER printing is enabled for
 * this shader.
 */
void
fs_visitor::debug_optimizer(const nir_shader *nir,
                            const char *pass_name,
                            int iteration, int pass_num) const
{
   if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
      return;

   char *filename;
   int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
                      debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
                      _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
                      iteration, pass_num, pass_name);
   /* asprintf leaves filename undefined on failure; bail without dumping. */
   if (ret == -1)
      return;
   dump_instructions(filename);
   free(filename);
}

/**
 * Run the backend optimization pipeline: an iterated fixed-point loop of
 * cleanup passes followed by the various lowering passes, validating the IR
 * after every pass.
 */
void
fs_visitor::optimize()
{
   debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   validate();

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

/* Run a pass, dump the result if it made progress, validate, and fold its
 * progress into the enclosing loop's progress flag.
 */
#define OPT(pass, args...) ({                                        \
      pass_num++;                                                    \
      bool this_progress = pass(args);                               \
                                                                     \
      if (this_progress)                                             \
         debug_optimizer(nir, #pass, iteration, pass_num);           \
                                                                     \
      validate();                                                    \
                                                                     \
      progress = progress || this_progress;                          \
      this_progress;                                                 \
   })

   assign_constant_locations();
   OPT(lower_constant_loads);

   validate();

   if (compiler->lower_dpas)
      OPT(brw_lower_dpas, *this);

   OPT(split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice.  Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(dead_code_eliminate);

   OPT(remove_extra_rounding_modes);

   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(remove_duplicate_mrf_writes);

      OPT(opt_algebraic);
      OPT(opt_cse);
      OPT(opt_copy_propagation);
      OPT(opt_predicated_break, this);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
      OPT(opt_peephole_sel);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_saturate_propagation);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(eliminate_find_live_channel);

      OPT(compact_virtual_grfs);
   } while (progress);

   progress = false;
   pass_num = 0;

   if (OPT(lower_pack)) {
      OPT(register_coalesce);
      OPT(dead_code_eliminate);
   }

   OPT(lower_simd_width);
   OPT(lower_barycentrics);
   OPT(lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(opt_copy_propagation))
      OPT(opt_algebraic);

   /* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (devinfo->ver >= 7) {
      if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
         OPT(opt_algebraic);
   }

   OPT(opt_split_sends);
   OPT(fixup_nomask_control_flow);

   if (progress) {
      if (OPT(opt_copy_propagation))
         OPT(opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(opt_cse);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
      OPT(remove_duplicate_mrf_writes);
      OPT(opt_peephole_sel);
   }

   OPT(opt_redundant_halt);

   if (OPT(lower_load_payload)) {
      OPT(split_virtual_grfs);

      /* Lower 64 bit MOVs generated by payload lowering. */
      if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
         OPT(opt_algebraic);

      OPT(register_coalesce);
      OPT(lower_simd_width);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }

   OPT(opt_combine_constants);
   if (OPT(lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(lower_integer_multiplication);
   }
   OPT(lower_sub_sat);

   if (devinfo->ver <= 5 && OPT(lower_minmax)) {
      OPT(opt_cmod_propagation);
      OPT(opt_cse);
      if (OPT(opt_copy_propagation))
         OPT(opt_algebraic);
      OPT(dead_code_eliminate);
   }

   progress = false;
   OPT(lower_derivatives);
   OPT(lower_regioning);
   if (progress) {
      if (OPT(opt_copy_propagation))
         OPT(opt_algebraic);
      OPT(dead_code_eliminate);
      OPT(lower_simd_width);
   }

   OPT(fixup_sends_duplicate_payload);

   OPT(lower_uniform_pull_constant_loads);

   OPT(lower_find_live_channel);

   validate();
}

/**
 * From the Skylake PRM Vol.
 * 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
fs_visitor::fixup_sends_duplicate_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
                             BRW_REGISTER_TYPE_UD);
         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld = fs_builder(this, block, inst).exec_all().group(16, 0);
         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
         fs_reg copy_dst = tmp;
         /* Copy two registers per SIMD16 MOV; a trailing odd register gets a
          * SIMD8 MOV instead.
          */
         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
            if (inst->ex_mlen == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         inst->src[3] = tmp;
         progress = true;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Three source instruction must have a GRF/MRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
void
fs_visitor::fixup_3src_null_dest()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_3src(compiler) && inst->dst.is_null()) {
         /* Replace the null destination with a scratch VGRF sized for the
          * current dispatch width.
          */
         inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                            inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                          DEPENDENCY_VARIABLES);
}

/**
 * Returns true if \p inst is a UGM memory access that must be covered by a
 * dummy fence before EOT (see Wa_22013689345 below).
 */
static bool
needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
{
   /* This workaround is about making sure that any instruction writing
    * through UGM has completed before we hit EOT.
    */
   if (inst->sfid != GFX12_SFID_UGM)
      return false;

   /* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
    * where the L1-cache override is NOT among {WB, WS, WT}
    */
   enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
   if (lsc_opcode_is_store(opcode)) {
      switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
      case LSC_CACHE_STORE_L1STATE_L3MOCS:
      case LSC_CACHE_STORE_L1WB_L3WB:
      case LSC_CACHE_STORE_L1S_L3UC:
      case LSC_CACHE_STORE_L1S_L3WB:
      case LSC_CACHE_STORE_L1WT_L3UC:
      case LSC_CACHE_STORE_L1WT_L3WB:
         return false;

      default:
         return true;
      }
   }

   /* Any UGM Atomic message WITHOUT return value */
   if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
      return true;

   return false;
}

/* Wa_14015360517
 *
 * The first instruction of any kernel should have non-zero emask.
 * Make sure this happens by introducing a dummy mov instruction.
 */
void
fs_visitor::emit_dummy_mov_instruction()
{
   if (!intel_needs_workaround(devinfo, 14015360517))
      return;

   struct backend_instruction *first_inst =
      cfg->first_block()->start();

   /* We can skip the WA if first instruction is marked with
    * force_writemask_all or exec_size equals dispatch_width.
    */
   if (first_inst->force_writemask_all ||
       first_inst->exec_size == dispatch_width)
      return;

   /* Insert dummy mov as first instruction. */
   const fs_builder ubld =
      fs_builder(this, cfg->first_block(), (fs_inst *)first_inst).exec_all().group(8, 0);
   /* A NoMask MOV to the null register satisfies the non-zero-emask
    * requirement without observable side effects.
    */
   ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u));

   invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
}

/* Wa_22013689345
 *
 * We need to emit UGM fence message before EOT, if shader has any UGM write
 * or atomic message.
 *
 * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
 *                We probably need a better criteria in needs_dummy_fence().
 */
void
fs_visitor::emit_dummy_memory_fence_before_eot()
{
   bool progress = false;
   bool has_ugm_write_or_atomic = false;

   if (!intel_needs_workaround(devinfo, 22013689345))
      return;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (!inst->eot) {
         if (needs_dummy_fence(devinfo, inst))
            has_ugm_write_or_atomic = true;
         continue;
      }

      if (!has_ugm_write_or_atomic)
         break;

      const fs_builder ibld(this, block, inst);
      const fs_builder ubld = ibld.exec_all().group(1, 0);

      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
                                       dst, brw_vec8_grf(0, 0),
                                       /* commit enable */ brw_imm_ud(1),
                                       /* bti */ brw_imm_ud(0));
      dummy_fence->sfid = GFX12_SFID_UGM;
      dummy_fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
                                             LSC_FLUSH_TYPE_NONE_6, false);
      /* The scheduling fence makes the EOT send wait for the fence result. */
      ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
      progress = true;
      /* TODO: remove this break if we ever have shader with multiple EOT. */
      break;
   }

   if (progress) {
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
                          DEPENDENCY_VARIABLES);
   }
}

/**
 * Find the first instruction in the program that might start a region of
 * divergent control flow due to a HALT jump.
 * There is no
 * find_halt_control_flow_region_end(), the region of divergence extends until
 * the only SHADER_OPCODE_HALT_TARGET in the program.
 */
static const fs_inst *
find_halt_control_flow_region_start(const fs_visitor *v)
{
   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
      if (inst->opcode == BRW_OPCODE_HALT ||
          inst->opcode == SHADER_OPCODE_HALT_TARGET)
         return inst;
   }

   return NULL;
}

/**
 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
 * can cause a BB to be executed with all channels disabled, which will lead
 * to the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down.  This may break
 * assumptions of some NoMask SEND messages whose descriptor depends on data
 * generated by live invocations of the shader.
 *
 * This avoids the problem by predicating certain instructions on an ANY
 * horizontal predicate that makes sure that their execution is omitted when
 * all channels of the program are disabled.
 */
bool
fs_visitor::fixup_nomask_control_flow()
{
   if (devinfo->ver != 12)
      return false;

   /* Pick the ANY predicate wide enough to cover the whole dispatch. */
   const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
                                                   BRW_PREDICATE_ALIGN1_ANY8H;
   const fs_inst *halt_start = find_halt_control_flow_region_start(this);
   unsigned depth = 0;
   bool progress = false;

   const fs_live_variables &live_vars = live_analysis.require();

   /* Scan the program backwards in order to be able to easily determine
    * whether the flag register is live at any point.
    */
   foreach_block_reverse_safe(block, cfg) {
      BITSET_WORD flag_liveout = live_vars.block_data[block->num]
                                          .flag_liveout[0];
      STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         if (!inst->predicate && inst->exec_size >= 8)
            flag_liveout &= ~inst->flags_written(devinfo);

         switch (inst->opcode) {
         case BRW_OPCODE_DO:
         case BRW_OPCODE_IF:
            /* Note that this doesn't handle BRW_OPCODE_HALT since only
             * the first one in the program closes the region of divergent
             * control flow due to any HALT instructions -- Instead this is
             * handled with the halt_start check below.
             */
            depth--;
            break;

         case BRW_OPCODE_WHILE:
         case BRW_OPCODE_ENDIF:
         case SHADER_OPCODE_HALT_TARGET:
            depth++;
            break;

         default:
            /* Note that the vast majority of NoMask SEND instructions in the
             * program are harmless while executed in a block with all
             * channels disabled, since any instructions with side effects we
             * could hit here should be execution-masked.
             *
             * The main concern is NoMask SEND instructions where the message
             * descriptor or header depends on data generated by live
             * invocations of the shader (RESINFO and
             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
             * computed surface index seem to be the only examples right now
             * where this could easily lead to GPU hangs).  Unfortunately we
             * have no straightforward way to detect that currently, so just
             * predicate any NoMask SEND instructions we find under control
             * flow.
             *
             * If this proves to have a measurable performance impact it can
             * be easily extended with a whitelist of messages we know we can
             * safely omit the predication for.
+ */ + if (depth && inst->force_writemask_all && + is_send(inst) && !inst->predicate) { + /* We need to load the execution mask into the flag register by + * using a builder with channel group matching the whole shader + * (rather than the default which is derived from the original + * instruction), in order to avoid getting a right-shifted + * value. + */ + const fs_builder ubld = fs_builder(this, block, inst) + .exec_all().group(dispatch_width, 0); + const fs_reg flag = retype(brw_flag_reg(0, 0), + BRW_REGISTER_TYPE_UD); + + /* Due to the lack of flag register allocation we need to save + * and restore the flag register if it's live. + */ + const bool save_flag = flag_liveout & + flag_mask(flag, dispatch_width / 8); + const fs_reg tmp = ubld.group(8, 0).vgrf(flag.type); + + if (save_flag) { + ubld.group(8, 0).UNDEF(tmp); + ubld.group(1, 0).MOV(tmp, flag); + } + + ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS); + + set_predicate(pred, inst); + inst->flag_subreg = 0; + inst->predicate_trivial = true; + + if (save_flag) + ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp); + + progress = true; + } + break; + } + + if (inst == halt_start) + depth--; + + flag_liveout |= inst->flags_read(devinfo); + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +uint32_t +fs_visitor::compute_max_register_pressure() +{ + const register_pressure &rp = regpressure_analysis.require(); + uint32_t ip = 0, max_pressure = 0; + foreach_block_and_inst(block, backend_instruction, inst, cfg) { + max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]); + ip++; + } + return max_pressure; +} + +static fs_inst ** +save_instruction_order(const struct cfg_t *cfg) +{ + /* Before we schedule anything, stash off the instruction order as an array + * of fs_inst *. This way, we can reset it between scheduling passes to + * prevent dependencies between the different scheduling modes. 
+ */ + int num_insts = cfg->last_block()->end_ip + 1; + fs_inst **inst_arr = new fs_inst * [num_insts]; + + int ip = 0; + foreach_block_and_inst(block, fs_inst, inst, cfg) { + assert(ip >= block->start_ip && ip <= block->end_ip); + inst_arr[ip++] = inst; + } + assert(ip == num_insts); + + return inst_arr; +} + +static void +restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr) +{ + ASSERTED int num_insts = cfg->last_block()->end_ip + 1; + + int ip = 0; + foreach_block (block, cfg) { + block->instructions.make_empty(); + + assert(ip == block->start_ip); + for (; ip <= block->end_ip; ip++) + block->instructions.push_tail(inst_arr[ip]); + } + assert(ip == num_insts); +} + +void +fs_visitor::allocate_registers(bool allow_spilling) +{ + bool allocated; + + static const enum instruction_scheduler_mode pre_modes[] = { + SCHEDULE_PRE, + SCHEDULE_PRE_NON_LIFO, + SCHEDULE_NONE, + SCHEDULE_PRE_LIFO, + }; + + static const char *scheduler_mode_name[] = { + [SCHEDULE_PRE] = "top-down", + [SCHEDULE_PRE_NON_LIFO] = "non-lifo", + [SCHEDULE_PRE_LIFO] = "lifo", + [SCHEDULE_POST] = "post", + [SCHEDULE_NONE] = "none", + }; + + uint32_t best_register_pressure = UINT32_MAX; + enum instruction_scheduler_mode best_sched = SCHEDULE_NONE; + + compact_virtual_grfs(); + + if (needs_register_pressure) + shader_stats.max_register_pressure = compute_max_register_pressure(); + + debug_optimizer(nir, "pre_register_allocate", 90, 90); + + bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS); + + /* Before we schedule anything, stash off the instruction order as an array + * of fs_inst *. This way, we can reset it between scheduling passes to + * prevent dependencies between the different scheduling modes. 
+ */ + fs_inst **orig_order = save_instruction_order(cfg); + fs_inst **best_pressure_order = NULL; + + void *scheduler_ctx = ralloc_context(NULL); + fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx); + + /* Try each scheduling heuristic to see if it can successfully register + * allocate without spilling. They should be ordered by decreasing + * performance but increasing likelihood of allocating. + */ + for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { + enum instruction_scheduler_mode sched_mode = pre_modes[i]; + + schedule_instructions_pre_ra(sched, sched_mode); + this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode]; + + debug_optimizer(nir, shader_stats.scheduler_mode, 95, i); + + if (0) { + assign_regs_trivial(); + allocated = true; + break; + } + + /* We should only spill registers on the last scheduling. */ + assert(!spilled_any_registers); + + allocated = assign_regs(false, spill_all); + if (allocated) + break; + + /* Save the maximum register pressure */ + uint32_t this_pressure = compute_max_register_pressure(); + + if (0) { + fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n", + scheduler_mode_name[sched_mode], this_pressure); + } + + if (this_pressure < best_register_pressure) { + best_register_pressure = this_pressure; + best_sched = sched_mode; + delete[] best_pressure_order; + best_pressure_order = save_instruction_order(cfg); + } + + /* Reset back to the original order before trying the next mode */ + restore_instruction_order(cfg, orig_order); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + + ralloc_free(scheduler_ctx); + + if (!allocated) { + if (0) { + fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n", + scheduler_mode_name[best_sched]); + } + restore_instruction_order(cfg, best_pressure_order); + shader_stats.scheduler_mode = scheduler_mode_name[best_sched]; + + allocated = assign_regs(allow_spilling, spill_all); + } + + delete[] orig_order; + delete[] 
best_pressure_order; + + if (!allocated) { + fail("Failure to register allocate. Reduce number of " + "live scalar values to avoid this."); + } else if (spilled_any_registers) { + brw_shader_perf_log(compiler, log_data, + "%s shader triggered register spilling. " + "Try reducing the number of live scalar " + "values to improve performance.\n", + _mesa_shader_stage_to_string(stage)); + } + + /* This must come after all optimization and register allocation, since + * it inserts dead code that happens to have side effects, and it does + * so based on the actual physical registers in use. + */ + insert_gfx4_send_dependency_workarounds(); + + if (failed) + return; + + opt_bank_conflicts(); + + schedule_instructions_post_ra(); + + if (last_scratch > 0) { + ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024; + + /* Take the max of any previously compiled variant of the shader. In the + * case of bindless shaders with return parts, this will also take the + * max of all parts. + */ + prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch), + prog_data->total_scratch); + + if (gl_shader_stage_is_compute(stage)) { + if (devinfo->platform == INTEL_PLATFORM_HSW) { + /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" + * field documentation, Haswell supports a minimum of 2kB of + * scratch space for compute shaders, unlike every other stage + * and platform. + */ + prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048); + } else if (devinfo->ver <= 7) { + /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" + * field documentation, platforms prior to Haswell measure scratch + * size linearly with a range of [1kB, 12kB] and 1kB granularity. + */ + prog_data->total_scratch = ALIGN(last_scratch, 1024); + max_scratch_size = 12 * 1024; + } + } + + /* We currently only support up to 2MB of scratch space. 
If we + * need to support more eventually, the documentation suggests + * that we could allocate a larger buffer, and partition it out + * ourselves. We'd just have to undo the hardware's address + * calculation by subtracting (FFTID * Per Thread Scratch Space) + * and then add FFTID * (Larger Per Thread Scratch Space). + * + * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > + * Thread Group Tracking > Local Memory/Scratch Space. + */ + assert(prog_data->total_scratch < max_scratch_size); + } + + lower_scoreboard(); +} + +bool +fs_visitor::run_vs() +{ + assert(stage == MESA_SHADER_VERTEX); + + payload_ = new vs_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + emit_urb_writes(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_vs_urb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(true /* allow_spilling */); + + return !failed; +} + +void +fs_visitor::set_tcs_invocation_id() +{ + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); + struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; + const fs_builder bld = fs_builder(this).at_end(); + + const unsigned instance_id_mask = + (devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) : + (devinfo->ver >= 11) ? INTEL_MASK(22, 16) : + INTEL_MASK(23, 17); + const unsigned instance_id_shift = + (devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 
16 : 17; + + /* Get instance number from g0.2 bits: + * * 7:0 on DG2+ + * * 22:16 on gfx11+ + * * 23:17 otherwise + */ + fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)), + brw_imm_ud(instance_id_mask)); + + invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + + if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) { + /* gl_InvocationID is just the thread number */ + bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift)); + return; + } + + assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH); + + fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW); + fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210))); + bld.MOV(channels_ud, channels_uw); + + if (tcs_prog_data->instances == 1) { + invocation_id = channels_ud; + } else { + fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3)); + bld.ADD(invocation_id, instance_times_8, channels_ud); + } +} + +void +fs_visitor::emit_tcs_thread_end() +{ + /* Try and tag the last URB write with EOT instead of emitting a whole + * separate write just to finish the thread. There isn't guaranteed to + * be one, so this may not succeed. + */ + if (devinfo->ver != 8 && mark_last_urb_write_with_eot()) + return; + + const fs_builder bld = fs_builder(this).at_end(); + + /* Emit a URB write to end the thread. On Broadwell, we use this to write + * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy + * algorithm to set it optimally). On other platforms, we simply write + * zero to a reserved/MBZ patch header DWord which has no consequence. 
+ */ + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16); + srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1); + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->eot = true; +} + +bool +fs_visitor::run_tcs() +{ + assert(stage == MESA_SHADER_TESS_CTRL); + + struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); + const fs_builder bld = fs_builder(this).at_end(); + + assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH || + vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH); + + payload_ = new tcs_thread_payload(*this); + + /* Initialize gl_InvocationID */ + set_tcs_invocation_id(); + + const bool fix_dispatch_mask = + vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH && + (nir->info.tess.tcs_vertices_out % 8) != 0; + + /* Fix the disptach mask */ + if (fix_dispatch_mask) { + bld.CMP(bld.null_reg_ud(), invocation_id, + brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L); + bld.IF(BRW_PREDICATE_NORMAL); + } + + nir_to_brw(this); + + if (fix_dispatch_mask) { + bld.emit(BRW_OPCODE_ENDIF); + } + + emit_tcs_thread_end(); + + if (failed) + return false; + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tcs_urb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(true /* allow_spilling */); + + return !failed; +} + +bool +fs_visitor::run_tes() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + payload_ = new tes_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + emit_urb_writes(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tes_urb_setup(); + + fixup_3src_null_dest(); + 
emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(true /* allow_spilling */); + + return !failed; +} + +bool +fs_visitor::run_gs() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + payload_ = new gs_thread_payload(*this); + + this->final_gs_vertex_count = vgrf(glsl_uint_type()); + + if (gs_compile->control_data_header_size_bits > 0) { + /* Create a VGRF to store accumulated control data bits. */ + this->control_data_bits = vgrf(glsl_uint_type()); + + /* If we're outputting more than 32 control data bits, then EmitVertex() + * will set control_data_bits to 0 after emitting the first vertex. + * Otherwise, we need to initialize it to 0 here. + */ + if (gs_compile->control_data_header_size_bits <= 32) { + const fs_builder bld = fs_builder(this).at_end(); + const fs_builder abld = bld.annotate("initialize control data bits"); + abld.MOV(this->control_data_bits, brw_imm_ud(0u)); + } + } + + nir_to_brw(this); + + emit_gs_thread_end(); + + if (failed) + return false; + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_gs_urb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(true /* allow_spilling */); + + return !failed; +} + +/* From the SKL PRM, Volume 16, Workarounds: + * + * 0877 3D Pixel Shader Hang possible when pixel shader dispatched with + * only header phases (R0-R2) + * + * WA: Enable a non-header phase (e.g. push constant) when dispatch would + * have been header only. + * + * Instead of enabling push constants one can alternatively enable one of the + * inputs. Here one simply chooses "layer" which shouldn't impose much + * overhead. 
+ */ +static void +gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data) +{ + if (wm_prog_data->num_varying_inputs) + return; + + if (wm_prog_data->base.curb_read_length) + return; + + wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; + wm_prog_data->num_varying_inputs = 1; + + brw_compute_urb_setup_index(wm_prog_data); +} + +bool +fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) +{ + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); + brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key; + const fs_builder bld = fs_builder(this).at_end(); + + assert(stage == MESA_SHADER_FRAGMENT); + + payload_ = new fs_thread_payload(*this, source_depth_to_render_target, + runtime_check_aads_emit); + + if (do_rep_send) { + assert(dispatch_width == 16); + emit_repclear_shader(); + } else { + if (nir->info.inputs_read > 0 || + BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) || + (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) { + if (devinfo->ver < 6) + emit_interpolation_setup_gfx4(); + else + emit_interpolation_setup_gfx6(); + } + + /* We handle discards by keeping track of the still-live pixels in f0.1. + * Initialize it with the dispatched pixels. + */ + if (wm_prog_data->uses_kill) { + const unsigned lower_width = MIN2(dispatch_width, 16); + for (unsigned i = 0; i < dispatch_width / lower_width; i++) { + /* According to the "PS Thread Payload for Normal + * Dispatch" pages on the BSpec, the dispatch mask is + * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on + * gfx6+. + */ + const fs_reg dispatch_mask = + devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) : + devinfo->ver >= 6 ? 
brw_vec1_grf(i + 1, 7) : + brw_vec1_grf(0, 0); + bld.exec_all().group(1, 0) + .MOV(brw_sample_mask_reg(bld.group(lower_width, i)), + retype(dispatch_mask, BRW_REGISTER_TYPE_UW)); + } + } + + if (nir->info.writes_memory) + wm_prog_data->has_side_effects = true; + + nir_to_brw(this); + + if (failed) + return false; + + if (wm_key->emit_alpha_test) + emit_alpha_test(); + + emit_fb_writes(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + if (devinfo->ver == 9) + gfx9_ps_header_only_workaround(wm_prog_data); + + assign_urb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + } + + return !failed; +} + +bool +fs_visitor::run_cs(bool allow_spilling) +{ + assert(gl_shader_stage_is_compute(stage)); + assert(devinfo->ver >= 7); + const fs_builder bld = fs_builder(this).at_end(); + + payload_ = new cs_thread_payload(*this); + + if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) { + /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */ + const fs_builder abld = bld.exec_all().group(1, 0); + abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW), + suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1)); + } + + nir_to_brw(this); + + if (failed) + return false; + + emit_cs_terminate(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + + return !failed; +} + +bool +fs_visitor::run_bs(bool allow_spilling) +{ + assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE); + + payload_ = new bs_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + /* TODO(RT): Perhaps rename this? 
*/ + emit_cs_terminate(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + + return !failed; +} + +bool +fs_visitor::run_task(bool allow_spilling) +{ + assert(stage == MESA_SHADER_TASK); + + payload_ = new task_mesh_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + emit_urb_fence(); + + emit_cs_terminate(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + + return !failed; +} + +bool +fs_visitor::run_mesh(bool allow_spilling) +{ + assert(stage == MESA_SHADER_MESH); + + payload_ = new task_mesh_thread_payload(*this); + + nir_to_brw(this); + + if (failed) + return false; + + emit_urb_fence(); + + emit_cs_terminate(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + + fixup_3src_null_dest(); + emit_dummy_memory_fence_before_eot(); + + /* Wa_14015360517 */ + emit_dummy_mov_instruction(); + + allocate_registers(allow_spilling); + + return !failed; +} + +static bool +is_used_in_not_interp_frag_coord(nir_def *def) +{ + nir_foreach_use_including_if(src, def) { + if (nir_src_is_if(src)) + return true; + + if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic) + return true; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src)); + if (intrin->intrinsic != nir_intrinsic_load_frag_coord) + return true; + } + + return false; +} + +/** + * Return a bitfield where bit n is set if barycentric interpolation mode n + * (see enum brw_barycentric_mode) is needed by the fragment shader. 
+ * + * We examine the load_barycentric intrinsics rather than looking at input + * variables so that we catch interpolateAtCentroid() messages too, which + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up. + */ +static unsigned +brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo, + const nir_shader *shader) +{ + unsigned barycentric_interp_modes = 0; + + nir_foreach_function_impl(impl, shader) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + break; + default: + continue; + } + + /* Ignore WPOS; it doesn't require interpolation. */ + if (!is_used_in_not_interp_frag_coord(&intrin->def)) + continue; + + nir_intrinsic_op bary_op = intrin->intrinsic; + enum brw_barycentric_mode bary = + brw_barycentric_mode(intrin); + + barycentric_interp_modes |= 1 << bary; + + if (devinfo->needs_unlit_centroid_workaround && + bary_op == nir_intrinsic_load_barycentric_centroid) + barycentric_interp_modes |= 1 << centroid_to_pixel(bary); + } + } + } + + return barycentric_interp_modes; +} + +static void +brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, + const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_shader_in_variable(var, shader) { + /* flat shading */ + if (var->data.interpolation != INTERP_MODE_FLAT) + continue; + + if (var->data.per_primitive) + continue; + + unsigned slots = glsl_count_attribute_slots(var->type, false); + for (unsigned s = 0; s < slots; s++) { + int input_index = prog_data->urb_setup[var->data.location + s]; + + if (input_index >= 0) + prog_data->flat_inputs |= 1 << 
input_index; + } + } +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + /* We initially set this to OFF, but having the shader write the + * depth means we allocate register space in the SEND message. The + * difference between the SEND register count and the OFF state + * programming makes the HW hang. + * + * Removing the depth writes also leads to test failures. So use + * LesserThanOrEqual, which fits writing the same value + * (unchanged/equal). + * + */ + return BRW_PSCDEPTH_ON_LE; + } + } + return BRW_PSCDEPTH_OFF; +} + +/** + * Move load_interpolated_input with simple (payload-based) barycentric modes + * to the top of the program so we don't emit multiple PLNs for the same input. + * + * This works around CSE not being able to handle non-dominating cases + * such as: + * + * if (...) { + * interpolate input + * } else { + * interpolate the same exact input + * } + * + * This should be replaced by global value numbering someday. 
+ */ +bool +brw_nir_move_interpolation_to_top(nir_shader *nir) +{ + bool progress = false; + + nir_foreach_function_impl(impl, nir) { + nir_block *top = nir_start_block(impl); + nir_cursor cursor = nir_before_instr(nir_block_first_instr(top)); + bool impl_progress = false; + + for (nir_block *block = nir_block_cf_tree_next(top); + block != NULL; + block = nir_block_cf_tree_next(block)) { + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_interpolated_input) + continue; + nir_intrinsic_instr *bary_intrinsic = + nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr); + nir_intrinsic_op op = bary_intrinsic->intrinsic; + + /* Leave interpolateAtSample/Offset() where they are. */ + if (op == nir_intrinsic_load_barycentric_at_sample || + op == nir_intrinsic_load_barycentric_at_offset) + continue; + + nir_instr *move[3] = { + &bary_intrinsic->instr, + intrin->src[1].ssa->parent_instr, + instr + }; + + for (unsigned i = 0; i < ARRAY_SIZE(move); i++) { + if (move[i]->block != top) { + nir_instr_move(cursor, move[i]); + impl_progress = true; + } + } + } + } + + progress = progress || impl_progress; + + nir_metadata_preserve(impl, impl_progress ? (nir_metadata_block_index | + nir_metadata_dominance) + : nir_metadata_all); + } + + return progress; +} + +static void +brw_nir_populate_wm_prog_data(nir_shader *shader, + const struct intel_device_info *devinfo, + const struct brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const struct brw_mue_map *mue_map) +{ + /* key->alpha_test_func means simulating alpha testing via discards, + * so the shader definitely kills pixels. 
+ */ + prog_data->uses_kill = shader->info.fs.uses_discard || + shader->info.fs.uses_demote || + key->emit_alpha_test; + prog_data->uses_omask = !key->ignore_sample_mask_out && + (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->color_outputs_written = key->color_outputs_valid; + prog_data->max_polygons = 1; + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->sample_shading = + shader->info.fs.uses_sample_shading || + shader->info.outputs_read; + + assert(key->multisample_fbo != BRW_NEVER || + key->persample_interp == BRW_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = BRW_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch, + key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. + */ + prog_data->alpha_to_coverage = key->alpha_to_coverage; + assert(prog_data->alpha_to_coverage != BRW_SOMETIMES || + prog_data->persample_dispatch == BRW_SOMETIMES); + + if (devinfo->ver >= 6) { + prog_data->uses_sample_mask = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN); + + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. 
+ */ + prog_data->uses_pos_offset = + prog_data->persample_dispatch != BRW_NEVER && + (BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS) || + BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); + } + + prog_data->has_render_target_reads = shader->info.outputs_read != 0ull; + + prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; + prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage; + prog_data->inner_coverage = shader->info.fs.inner_coverage; + + prog_data->barycentric_interp_modes = + brw_compute_barycentric_interp_modes(devinfo, shader); + + /* From the BDW PRM documentation for 3DSTATE_WM: + * + * "MSDISPMODE_PERSAMPLE is required in order to select Perspective + * Sample or Non- perspective Sample barycentric coordinates." + * + * So cleanup any potentially set sample barycentric mode when not in per + * sample dispatch. + */ + if (prog_data->persample_dispatch == BRW_NEVER) { + prog_data->barycentric_interp_modes &= + ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE); + } + + prog_data->uses_nonperspective_interp_modes |= + (prog_data->barycentric_interp_modes & + BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0; + + /* The current VK_EXT_graphics_pipeline_library specification requires + * coarse to specified at compile time. But per sample interpolation can be + * dynamic. So we should never be in a situation where coarse & + * persample_interp are both respectively true & BRW_ALWAYS. + * + * Coarse will dynamically turned off when persample_interp is active. 
+ */ + assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS); + + prog_data->coarse_pixel_dispatch = + brw_sometimes_invert(prog_data->persample_dispatch); + if (!key->coarse_pixel || + prog_data->uses_omask || + prog_data->sample_shading || + prog_data->uses_sample_mask || + (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) || + prog_data->computed_stencil) { + prog_data->coarse_pixel_dispatch = BRW_NEVER; + } + + /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater, + * Message Descriptor : + * + * "Message Type. Specifies the type of message being sent when + * pixel-rate evaluation is requested : + * + * Format = U2 + * 0: Per Message Offset (eval_snapped with immediate offset) + * 1: Sample Position Offset (eval_sindex) + * 2: Centroid Position Offset (eval_centroid) + * 3: Per Slot Offset (eval_snapped with register offset) + * + * Message Type. Specifies the type of message being sent when + * coarse-rate evaluation is requested : + * + * Format = U2 + * 0: Coarse to Pixel Mapping Message (internal message) + * 1: Reserved + * 2: Coarse Centroid Position (eval_centroid) + * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" + * + * The Sample Position Offset is marked as reserved for coarse rate + * evaluation and leads to hangs if we try to use it. So disable coarse + * pixel shading if we have any intrinsic that will result in a pixel + * interpolater message at sample. + */ + if (intel_nir_pulls_at_sample(shader)) + prog_data->coarse_pixel_dispatch = BRW_NEVER; + + /* We choose to always enable VMask prior to XeHP, as it would cause + * us to lose out on the eliminate_find_live_channel() optimization. 
+ */ + prog_data->uses_vmask = devinfo->verx10 < 125 || + shader->info.fs.needs_quad_helper_invocations || + shader->info.uses_wide_subgroup_intrinsics || + prog_data->coarse_pixel_dispatch != BRW_NEVER; + + prog_data->uses_src_w = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + prog_data->uses_src_depth = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && + prog_data->coarse_pixel_dispatch != BRW_ALWAYS; + prog_data->uses_depth_w_coefficients = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && + prog_data->coarse_pixel_dispatch != BRW_NEVER; + + calculate_urb_setup(devinfo, key, prog_data, shader, mue_map); + brw_compute_flat_inputs(prog_data, shader); +} + +/** + * Pre-gfx6, the register file of the EUs was shared between threads, + * and each thread used some subset allocated on a 16-register block + * granularity. The unit states wanted these block counts. + */ +static inline int +brw_register_blocks(int reg_count) +{ + return ALIGN(reg_count, 16) / 16 - 1; +} + +const unsigned * +brw_compile_fs(const struct brw_compiler *compiler, + struct brw_compile_fs_params *params) +{ + struct nir_shader *nir = params->base.nir; + const struct brw_wm_prog_key *key = params->key; + struct brw_wm_prog_data *prog_data = params->prog_data; + bool allow_spilling = params->allow_spilling; + const bool debug_enabled = + brw_should_print_shader(nir, params->base.debug_flag ? + params->base.debug_flag : DEBUG_WM); + + prog_data->base.stage = MESA_SHADER_FRAGMENT; + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.total_scratch = 0; + + const struct intel_device_info *devinfo = compiler->devinfo; + const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 
32 : 16; + + brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size); + brw_nir_lower_fs_inputs(nir, devinfo, key); + brw_nir_lower_fs_outputs(nir); + + if (devinfo->ver < 6) + brw_setup_vue_interpolation(params->vue_map, nir, prog_data); + + /* From the SKL PRM, Volume 7, "Alpha Coverage": + * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in + * hardware, regardless of the state setting for this feature." + */ + if (devinfo->ver > 6 && key->alpha_to_coverage != BRW_NEVER) { + /* Run constant fold optimization in order to get the correct source + * offset to determine render target 0 store instruction in + * emit_alpha_to_coverage pass. + */ + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data); + } + + NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); + brw_postprocess_nir(nir, compiler, debug_enabled, + key->base.robust_flags); + + brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data, + params->mue_map); + + std::unique_ptr v8, v16, v32, vmulti; + cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL, + *multi_cfg = NULL; + float throughput = 0; + bool has_spilled = false; + + if (devinfo->ver < 20) { + v8 = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 8, 1, + params->base.stats != NULL, + debug_enabled); + if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { + params->base.error_str = ralloc_strdup(params->base.mem_ctx, + v8->fail_msg); + return NULL; + } else if (INTEL_SIMD(FS, 8)) { + simd8_cfg = v8->cfg; + + assert(v8->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo); + + prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used); + const performance &perf = v8->performance_analysis.require(); + throughput = MAX2(throughput, perf.throughput); + has_spilled = v8->spilled_any_registers; + allow_spilling = false; + } + } + + /* Limit dispatch width 
to simd8 with dual source blending on gfx8. + * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917 + */ + if (devinfo->ver == 8 && prog_data->dual_src_blend && + INTEL_SIMD(FS, 8)) { + assert(!params->use_rep_send); + v8->limit_dispatch_width(8, "gfx8 workaround: " + "using SIMD8 when dual src blending.\n"); + } + + if (key->coarse_pixel && devinfo->ver < 20) { + if (prog_data->dual_src_blend) { + v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot" + " use SIMD8 messages.\n"); + } + v8->limit_dispatch_width(16, "SIMD32 not supported with coarse" + " pixel shading.\n"); + } + + if (nir->info.ray_queries > 0 && v8) + v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n"); + + if (!has_spilled && + (!v8 || v8->max_dispatch_width >= 16) && + (INTEL_SIMD(FS, 16) || params->use_rep_send)) { + /* Try a SIMD16 compile */ + v16 = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 16, 1, + params->base.stats != NULL, + debug_enabled); + if (v8) + v16->import_uniforms(v8.get()); + if (!v16->run_fs(allow_spilling, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD16 shader failed to compile: %s\n", + v16->fail_msg); + } else { + simd16_cfg = v16->cfg; + + assert(v16->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo); + + prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used); + const performance &perf = v16->performance_analysis.require(); + throughput = MAX2(throughput, perf.throughput); + has_spilled = v16->spilled_any_registers; + allow_spilling = false; + } + } + + const bool simd16_failed = v16 && !simd16_cfg; + + /* Currently, the compiler only supports SIMD32 on SNB+ */ + if (!has_spilled && + (!v8 || v8->max_dispatch_width >= 32) && + (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send && + devinfo->ver >= 6 && !simd16_failed && + INTEL_SIMD(FS, 32)) { + /* Try a SIMD32 compile */ + v32 = 
std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 32, 1, + params->base.stats != NULL, + debug_enabled); + if (v8) + v32->import_uniforms(v8.get()); + else if (v16) + v32->import_uniforms(v16.get()); + + if (!v32->run_fs(allow_spilling, false)) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD32 shader failed to compile: %s\n", + v32->fail_msg); + } else { + const performance &perf = v32->performance_analysis.require(); + + if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD32 shader inefficient\n"); + } else { + simd32_cfg = v32->cfg; + + assert(v32->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo); + + prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used); + throughput = MAX2(throughput, perf.throughput); + } + } + } + + if (devinfo->ver >= 12 && !has_spilled && + params->max_polygons >= 2 && !key->coarse_pixel) { + fs_visitor *vbase = v8 ? v8.get() : v16 ? 
v16.get() : v32.get(); + assert(vbase); + + if (devinfo->ver >= 20 && + params->max_polygons >= 4 && + vbase->max_dispatch_width >= 32 && + 4 * prog_data->num_varying_inputs <= MAX_VARYING && + INTEL_SIMD(FS, 4X8)) { + /* Try a quad-SIMD8 compile */ + vmulti = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 32, 4, + params->base.stats != NULL, + debug_enabled); + vmulti->import_uniforms(vbase); + if (!vmulti->run_fs(false, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "Quad-SIMD8 shader failed to compile: %s\n", + vmulti->fail_msg); + } else { + multi_cfg = vmulti->cfg; + assert(!vmulti->spilled_any_registers); + } + } + + if (!multi_cfg && devinfo->ver >= 20 && + vbase->max_dispatch_width >= 32 && + 2 * prog_data->num_varying_inputs <= MAX_VARYING && + INTEL_SIMD(FS, 2X16)) { + /* Try a dual-SIMD16 compile */ + vmulti = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 32, 2, + params->base.stats != NULL, + debug_enabled); + vmulti->import_uniforms(vbase); + if (!vmulti->run_fs(false, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "Dual-SIMD16 shader failed to compile: %s\n", + vmulti->fail_msg); + } else { + multi_cfg = vmulti->cfg; + assert(!vmulti->spilled_any_registers); + } + } + + if (!multi_cfg && vbase->max_dispatch_width >= 16 && + 2 * prog_data->num_varying_inputs <= MAX_VARYING && + INTEL_SIMD(FS, 2X8)) { + /* Try a dual-SIMD8 compile */ + vmulti = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 16, 2, + params->base.stats != NULL, + debug_enabled); + vmulti->import_uniforms(vbase); + if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) { + brw_shader_perf_log(compiler, params->base.log_data, + "Dual-SIMD8 shader failed to compile: %s\n", + vmulti->fail_msg); + } else { + multi_cfg = vmulti->cfg; + } + } + + if (multi_cfg) { + assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.dispatch_grf_start_reg = 
vmulti->payload().num_regs / reg_unit(devinfo); + + prog_data->reg_blocks_8 = brw_register_blocks(vmulti->grf_used); + } + } + + /* When the caller requests a repclear shader, they want SIMD16-only */ + if (params->use_rep_send) + simd8_cfg = NULL; + + /* Prior to Iron Lake, the PS had a single shader offset with a jump table + * at the top to select the shader. We've never implemented that. + * Instead, we just give them exactly one shader and we pick the widest one + * available. + */ + if (compiler->devinfo->ver < 5) { + if (simd32_cfg || simd16_cfg) + simd8_cfg = NULL; + if (simd32_cfg) + simd16_cfg = NULL; + } + + /* If computed depth is enabled SNB only allows SIMD8. */ + if (compiler->devinfo->ver == 6 && + prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) + assert(simd16_cfg == NULL && simd32_cfg == NULL); + + if (compiler->devinfo->ver <= 5 && !simd8_cfg) { + /* Iron lake and earlier only have one Dispatch GRF start field. Make + * the data available in the base prog data struct for convenience. + */ + if (simd16_cfg) { + prog_data->base.dispatch_grf_start_reg = + prog_data->dispatch_grf_start_reg_16; + } else if (simd32_cfg) { + prog_data->base.dispatch_grf_start_reg = + prog_data->dispatch_grf_start_reg_32; + } + } + + fs_generator g(compiler, ¶ms->base, &prog_data->base, + v8 && v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT); + + if (unlikely(debug_enabled)) { + g.enable_debug(ralloc_asprintf(params->base.mem_ctx, + "%s fragment shader %s", + nir->info.label ? + nir->info.label : "unnamed", + nir->info.name)); + } + + struct brw_compile_stats *stats = params->base.stats; + uint32_t max_dispatch_width = 0; + + if (multi_cfg) { + prog_data->dispatch_multi = vmulti->dispatch_width; + prog_data->max_polygons = vmulti->max_polygons; + g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats, + vmulti->performance_analysis.require(), + stats, vmulti->max_polygons); + stats = stats ? 
stats + 1 : NULL; + max_dispatch_width = vmulti->dispatch_width; + + } else if (simd8_cfg) { + prog_data->dispatch_8 = true; + g.generate_code(simd8_cfg, 8, v8->shader_stats, + v8->performance_analysis.require(), stats, 1); + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 8; + } + + if (simd16_cfg) { + prog_data->dispatch_16 = true; + prog_data->prog_offset_16 = g.generate_code( + simd16_cfg, 16, v16->shader_stats, + v16->performance_analysis.require(), stats, 1); + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 16; + } + + if (simd32_cfg) { + prog_data->dispatch_32 = true; + prog_data->prog_offset_32 = g.generate_code( + simd32_cfg, 32, v32->shader_stats, + v32->performance_analysis.require(), stats, 1); + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 32; + } + + for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++) + s->max_dispatch_width = max_dispatch_width; + + g.add_const_data(nir->constant_data, nir->constant_data_size); + return g.get_assembly(); +} + +unsigned +brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, + unsigned threads) +{ + assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0); + assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0); + return cs_prog_data->push.per_thread.size * threads + + cs_prog_data->push.cross_thread.size; +} + +static void +fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) +{ + block->dwords = dwords; + block->regs = DIV_ROUND_UP(dwords, 8); + block->size = block->regs * 32; +} + +static void +cs_fill_push_const_info(const struct intel_device_info *devinfo, + struct brw_cs_prog_data *cs_prog_data) +{ + const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data); + bool cross_thread_supported = devinfo->verx10 >= 75; + + /* The thread ID should be stored in the last param dword */ + assert(subgroup_id_index == -1 
|| + subgroup_id_index == (int)prog_data->nr_params - 1); + + unsigned cross_thread_dwords, per_thread_dwords; + if (!cross_thread_supported) { + cross_thread_dwords = 0u; + per_thread_dwords = prog_data->nr_params; + } else if (subgroup_id_index >= 0) { + /* Fill all but the last register with cross-thread payload */ + cross_thread_dwords = 8 * (subgroup_id_index / 8); + per_thread_dwords = prog_data->nr_params - cross_thread_dwords; + assert(per_thread_dwords > 0 && per_thread_dwords <= 8); + } else { + /* Fill all data using cross-thread payload */ + cross_thread_dwords = prog_data->nr_params; + per_thread_dwords = 0u; + } + + fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords); + fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords); + + assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 || + cs_prog_data->push.per_thread.size == 0); + assert(cs_prog_data->push.cross_thread.dwords + + cs_prog_data->push.per_thread.dwords == + prog_data->nr_params); +} + +static bool +filter_simd(const nir_instr *instr, const void * /* options */) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + switch (nir_instr_as_intrinsic(instr)->intrinsic) { + case nir_intrinsic_load_simd_width_intel: + case nir_intrinsic_load_subgroup_id: + return true; + + default: + return false; + } +} + +static nir_def * +lower_simd(nir_builder *b, nir_instr *instr, void *options) +{ + uintptr_t simd_width = (uintptr_t)options; + + switch (nir_instr_as_intrinsic(instr)->intrinsic) { + case nir_intrinsic_load_simd_width_intel: + return nir_imm_int(b, simd_width); + + case nir_intrinsic_load_subgroup_id: + /* If the whole workgroup fits in one thread, we can lower subgroup_id + * to a constant zero. 
+ */ + if (!b->shader->info.workgroup_size_variable) { + unsigned local_workgroup_size = b->shader->info.workgroup_size[0] * + b->shader->info.workgroup_size[1] * + b->shader->info.workgroup_size[2]; + if (local_workgroup_size <= simd_width) + return nir_imm_int(b, 0); + } + return NULL; + + default: + return NULL; + } +} + +bool +brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width) +{ + return nir_shader_lower_instructions(nir, filter_simd, lower_simd, + (void *)(uintptr_t)dispatch_width); +} + +const unsigned * +brw_compile_cs(const struct brw_compiler *compiler, + struct brw_compile_cs_params *params) +{ + const nir_shader *nir = params->base.nir; + const struct brw_cs_prog_key *key = params->key; + struct brw_cs_prog_data *prog_data = params->prog_data; + + const bool debug_enabled = + brw_should_print_shader(nir, params->base.debug_flag ? + params->base.debug_flag : DEBUG_CS); + + prog_data->base.stage = MESA_SHADER_COMPUTE; + prog_data->base.total_shared = nir->info.shared_size; + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.total_scratch = 0; + + if (!nir->info.workgroup_size_variable) { + prog_data->local_size[0] = nir->info.workgroup_size[0]; + prog_data->local_size[1] = nir->info.workgroup_size[1]; + prog_data->local_size[2] = nir->info.workgroup_size[2]; + } + + brw_simd_selection_state simd_state{ + .devinfo = compiler->devinfo, + .prog_data = prog_data, + .required_width = brw_required_dispatch_width(&nir->info), + }; + + std::unique_ptr v[3]; + + for (unsigned simd = 0; simd < 3; simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8u << simd; + + nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir); + brw_nir_apply_key(shader, compiler, &key->base, + dispatch_width); + + NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); + + /* Clean up after the local index and ID calculations. 
*/ + NIR_PASS(_, shader, nir_opt_constant_folding); + NIR_PASS(_, shader, nir_opt_dce); + + brw_postprocess_nir(shader, compiler, debug_enabled, + key->base.robust_flags); + + v[simd] = std::make_unique(compiler, ¶ms->base, + &key->base, + &prog_data->base, + shader, dispatch_width, + params->base.stats != NULL, + debug_enabled); + + const int first = brw_simd_first_compiled(simd_state); + if (first >= 0) + v[simd]->import_uniforms(v[first].get()); + + const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable; + + if (v[simd]->run_cs(allow_spilling)) { + cs_fill_push_const_info(compiler->devinfo, prog_data); + + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); + } else { + simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); + if (simd > 0) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD%u shader failed to compile: %s\n", + dispatch_width, v[simd]->fail_msg); + } + } + } + + const int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + params->base.error_str = + ralloc_asprintf(params->base.mem_ctx, + "Can't compile shader: " + "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n", + simd_state.error[0], simd_state.error[1], + simd_state.error[2]); + return NULL; + } + + assert(selected_simd < 3); + fs_visitor *selected = v[selected_simd].get(); + + if (!nir->info.workgroup_size_variable) + prog_data->prog_mask = 1 << selected_simd; + + fs_generator g(compiler, ¶ms->base, &prog_data->base, + selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE); + if (unlikely(debug_enabled)) { + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s compute shader %s", + nir->info.label ? 
+ nir->info.label : "unnamed", + nir->info.name); + g.enable_debug(name); + } + + uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1); + + struct brw_compile_stats *stats = params->base.stats; + for (unsigned simd = 0; simd < 3; simd++) { + if (prog_data->prog_mask & (1u << simd)) { + assert(v[simd]); + prog_data->prog_offset[simd] = + g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats, + v[simd]->performance_analysis.require(), stats); + if (stats) + stats->max_dispatch_width = max_dispatch_width; + stats = stats ? stats + 1 : NULL; + max_dispatch_width = 8u << simd; + } + } + + g.add_const_data(nir->constant_data, nir->constant_data_size); + + return g.get_assembly(); +} + +struct intel_cs_dispatch_info +brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, + const struct brw_cs_prog_data *prog_data, + const unsigned *override_local_size) +{ + struct intel_cs_dispatch_info info = {}; + + const unsigned *sizes = + override_local_size ? 
override_local_size : + prog_data->local_size; + + const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes); + assert(simd >= 0 && simd < 3); + + info.group_size = sizes[0] * sizes[1] * sizes[2]; + info.simd_size = 8u << simd; + info.threads = DIV_ROUND_UP(info.group_size, info.simd_size); + + const uint32_t remainder = info.group_size & (info.simd_size - 1); + if (remainder > 0) + info.right_mask = ~0u >> (32 - remainder); + else + info.right_mask = ~0u >> (32 - info.simd_size); + + return info; +} + +static uint8_t +compile_single_bs(const struct brw_compiler *compiler, + struct brw_compile_bs_params *params, + const struct brw_bs_prog_key *key, + struct brw_bs_prog_data *prog_data, + nir_shader *shader, + fs_generator *g, + struct brw_compile_stats *stats, + int *prog_offset) +{ + const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT); + + prog_data->base.stage = shader->info.stage; + prog_data->max_stack_size = MAX2(prog_data->max_stack_size, + shader->scratch_size); + + const unsigned max_dispatch_width = 16; + brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width); + brw_postprocess_nir(shader, compiler, debug_enabled, + key->base.robust_flags); + + brw_simd_selection_state simd_state{ + .devinfo = compiler->devinfo, + .prog_data = prog_data, + + /* Since divergence is a lot more likely in RT than compute, it makes + * sense to limit ourselves to the smallest available SIMD for now. + */ + .required_width = compiler->devinfo->ver >= 20 ? 
16u : 8u, + }; + + std::unique_ptr v[2]; + + for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8u << simd; + + if (dispatch_width == 8 && compiler->devinfo->ver >= 20) + continue; + + v[simd] = std::make_unique(compiler, ¶ms->base, + &key->base, + &prog_data->base, shader, + dispatch_width, + stats != NULL, + debug_enabled); + + const bool allow_spilling = !brw_simd_any_compiled(simd_state); + if (v[simd]->run_bs(allow_spilling)) { + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); + } else { + simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, + v[simd]->fail_msg); + if (simd > 0) { + brw_shader_perf_log(compiler, params->base.log_data, + "SIMD%u shader failed to compile: %s", + dispatch_width, v[simd]->fail_msg); + } + } + } + + const int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + params->base.error_str = + ralloc_asprintf(params->base.mem_ctx, + "Can't compile shader: " + "SIMD8 '%s' and SIMD16 '%s'.\n", + simd_state.error[0], simd_state.error[1]); + return 0; + } + + assert(selected_simd < int(ARRAY_SIZE(v))); + fs_visitor *selected = v[selected_simd].get(); + assert(selected); + + const unsigned dispatch_width = selected->dispatch_width; + + int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats, + selected->performance_analysis.require(), stats); + if (prog_offset) + *prog_offset = offset; + else + assert(offset == 0); + + return dispatch_width; +} + +uint64_t +brw_bsr(const struct intel_device_info *devinfo, + uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset) +{ + assert(offset % 64 == 0); + assert(simd_size == 8 || simd_size == 16); + assert(local_arg_offset % 8 == 0); + + return offset | + SET_BITS(simd_size == 8, 4, 4) | + SET_BITS(local_arg_offset / 8, 2, 0); +} + +const unsigned * +brw_compile_bs(const struct brw_compiler *compiler, + struct 
brw_compile_bs_params *params) +{ + nir_shader *shader = params->base.nir; + struct brw_bs_prog_data *prog_data = params->prog_data; + unsigned num_resume_shaders = params->num_resume_shaders; + nir_shader **resume_shaders = params->resume_shaders; + const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT); + + prog_data->base.stage = shader->info.stage; + prog_data->base.ray_queries = shader->info.ray_queries; + prog_data->base.total_scratch = 0; + + prog_data->max_stack_size = 0; + prog_data->num_resume_shaders = num_resume_shaders; + + fs_generator g(compiler, ¶ms->base, &prog_data->base, + false, shader->info.stage); + if (unlikely(debug_enabled)) { + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s %s shader %s", + shader->info.label ? + shader->info.label : "unnamed", + gl_shader_stage_name(shader->info.stage), + shader->info.name); + g.enable_debug(name); + } + + prog_data->simd_size = + compile_single_bs(compiler, params, params->key, prog_data, + shader, &g, params->base.stats, NULL); + if (prog_data->simd_size == 0) + return NULL; + + uint64_t *resume_sbt = ralloc_array(params->base.mem_ctx, + uint64_t, num_resume_shaders); + for (unsigned i = 0; i < num_resume_shaders; i++) { + if (INTEL_DEBUG(DEBUG_RT)) { + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s %s resume(%u) shader %s", + shader->info.label ? + shader->info.label : "unnamed", + gl_shader_stage_name(shader->info.stage), + i, shader->info.name); + g.enable_debug(name); + } + + /* TODO: Figure out shader stats etc. for resume shaders */ + int offset = 0; + uint8_t simd_size = + compile_single_bs(compiler, params, params->key, + prog_data, resume_shaders[i], &g, NULL, &offset); + if (simd_size == 0) + return NULL; + + assert(offset > 0); + resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0); + } + + /* We only have one constant data so we want to make sure they're all the + * same. 
+ */ + for (unsigned i = 0; i < num_resume_shaders; i++) { + assert(resume_shaders[i]->constant_data_size == + shader->constant_data_size); + assert(memcmp(resume_shaders[i]->constant_data, + shader->constant_data, + shader->constant_data_size) == 0); + } + + g.add_const_data(shader->constant_data, shader->constant_data_size); + g.add_resume_sbt(num_resume_shaders, resume_sbt); + + return g.get_assembly(); +} + +/** + * Test the dispatch mask packing assumptions of + * brw_stage_has_packed_dispatch(). Call this from e.g. the top of + * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is + * executed with an unexpected dispatch mask. + */ +static UNUSED void +brw_fs_test_dispatch_packing(const fs_builder &bld) +{ + const fs_visitor *shader = static_cast(bld.shader); + const gl_shader_stage stage = shader->stage; + const bool uses_vmask = + stage == MESA_SHADER_FRAGMENT && + brw_wm_prog_data(shader->stage_prog_data)->uses_vmask; + + if (brw_stage_has_packed_dispatch(shader->devinfo, stage, + shader->max_polygons, + shader->stage_prog_data)) { + const fs_builder ubld = bld.exec_all().group(1, 0); + const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0); + const fs_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg(); + + ubld.ADD(tmp, mask, brw_imm_ud(1)); + ubld.AND(tmp, mask, tmp); + + /* This will loop forever if the dispatch mask doesn't have the expected + * form '2^n-1', in which case tmp will be non-zero. 
       */
      bld.emit(BRW_OPCODE_DO);
      bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
      set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
   }
}

/* Total invocation count of the local workgroup (product of the three
 * dimensions stored in the CS prog data).
 */
unsigned
fs_visitor::workgroup_size() const
{
   assert(gl_shader_stage_uses_workgroup(stage));
   const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
   return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
}

/* Whether debug output should be printed for this shader: the requested
 * debug flag must be set, and internal shaders are skipped unless
 * NIR_DEBUG(PRINT_INTERNAL) is enabled.
 */
bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
{
   return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
}

namespace brw {
   /* Gather n consecutive payload values, whose fixed GRF numbers are given
    * in regs[], into a single VGRF of the requested type.  Returns a null
    * fs_reg when regs[0] is 0 (payload field absent).  For dispatch widths
    * above 16 the halves are combined with a LOAD_PAYLOAD in 16-wide groups.
    */
   fs_reg
   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
                     brw_reg_type type, unsigned n)
   {
      if (!regs[0])
         return fs_reg();

      if (bld.dispatch_width() > 16) {
         const fs_reg tmp = bld.vgrf(type, n);
         const brw::fs_builder hbld = bld.exec_all().group(16, 0);
         const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
         fs_reg *const components = new fs_reg[m * n];

         /* Interleave: component c of group g comes from payload GRF regs[g]. */
         for (unsigned c = 0; c < n; c++) {
            for (unsigned g = 0; g < m; g++)
               components[c * m + g] =
                  offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
         }

         hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);

         delete[] components;
         return tmp;

      } else {
         /* <=SIMD16: the payload is already contiguous starting at regs[0]. */
         return fs_reg(retype(brw_vec8_grf(regs[0], 0), type));
      }
   }

   /* Gather a two-component barycentric payload pair into one float VGRF.
    * Pre-Xe2 the hardware interleaves the two components per SIMD8 group,
    * hence the regs[g / 2] / (g % 2) addressing below; on ver >= 20 the
    * layout matches the generic fetch_payload_reg() path.
    */
   fs_reg
   fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
   {
      if (!regs[0])
         return fs_reg();
      else if (bld.shader->devinfo->ver >= 20)
         return fetch_payload_reg(bld, regs, BRW_REGISTER_TYPE_F, 2);

      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
      const brw::fs_builder hbld = bld.exec_all().group(8, 0);
      const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
      fs_reg *const components = new fs_reg[2 * m];

      for (unsigned c = 0; c < 2; c++) {
         for (unsigned g = 0; g < m; g++)
            components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
                                           hbld, c + 2 * (g % 2));
      }

      hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);

      delete[] components;
      return tmp;
   }

   /* Emit an AND of the dynamic MSAA flags with \p flag that only sets the
    * conditional-modifier flag register (null destination), for use by a
    * subsequent predicated instruction.
    */
   void
   check_dynamic_msaa_flag(const fs_builder &bld,
                           const struct brw_wm_prog_data *wm_prog_data,
                           enum intel_msaa_flags flag)
   {
      fs_inst *inst = bld.AND(bld.null_reg_ud(),
                              dynamic_msaa_flags(wm_prog_data),
                              brw_imm_ud(flag));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}
diff --git a/src/intel/compiler/elk/brw_fs.h b/src/intel/compiler/elk/brw_fs.h
new file mode 100644
index 00000000000..0ee32403541
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs.h
@@ -0,0 +1,637 @@
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt
 *
 */

#ifndef BRW_FS_H
#define BRW_FS_H

#include "brw_shader.h"
#include "brw_ir_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_ir_performance.h"
#include "compiler/nir/nir.h"

struct bblock_t;
namespace {
   struct acp_entry;
}

class fs_visitor;

namespace brw {
   /**
    * Register pressure analysis of a shader. Estimates how many registers
    * are live at any point of the program in GRF units.
    */
   struct register_pressure {
      register_pressure(const fs_visitor *v);
      ~register_pressure();

      analysis_dependency_class
      dependency_class() const
      {
         return (DEPENDENCY_INSTRUCTION_IDENTITY |
                 DEPENDENCY_INSTRUCTION_DATA_FLOW |
                 DEPENDENCY_VARIABLES);
      }

      bool
      validate(const fs_visitor *) const
      {
         /* FINISHME */
         return true;
      }

      /* Estimated pressure at each instruction, indexed by IP. */
      unsigned *regs_live_at_ip;
   };
}

struct brw_gs_compile;

namespace brw {
class fs_builder;
}

/* Per-compile statistics reported alongside the generated code. */
struct shader_stats {
   const char *scheduler_mode;
   unsigned promoted_constants;
   unsigned spill_count;
   unsigned fill_count;
   unsigned max_register_pressure;
};

/** Register numbers for thread payload fields. */
struct thread_payload {
   /** The number of thread payload registers the hardware will supply.
void optimize(); + void allocate_registers(bool allow_spilling); + uint32_t compute_max_register_pressure(); + bool fixup_sends_duplicate_payload(); + void fixup_3src_null_dest(); + void emit_dummy_memory_fence_before_eot(); + void emit_dummy_mov_instruction(); + bool fixup_nomask_control_flow(); + void assign_curb_setup(); + void assign_urb_setup(); + void convert_attr_sources_to_hw_regs(fs_inst *inst); + void assign_vs_urb_setup(); + void assign_tcs_urb_setup(); + void assign_tes_urb_setup(); + void assign_gs_urb_setup(); + bool assign_regs(bool allow_spilling, bool spill_all); + void assign_regs_trivial(); + void calculate_payload_ranges(unsigned payload_node_count, + int *payload_last_use_ip) const; + bool split_virtual_grfs(); + bool compact_virtual_grfs(); + void assign_constant_locations(); + bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index, + unsigned *out_pull_index); + bool lower_constant_loads(); + virtual void invalidate_analysis(brw::analysis_dependency_class c); + +#ifndef NDEBUG + void validate(); +#else + void validate() {} +#endif + + bool opt_algebraic(); + bool opt_redundant_halt(); + bool opt_cse(); + bool opt_cse_local(const brw::fs_live_variables &live, bblock_t *block, int &ip); + + bool opt_copy_propagation(); + bool opt_bank_conflicts(); + bool opt_split_sends(); + bool register_coalesce(); + bool compute_to_mrf(); + bool eliminate_find_live_channel(); + bool dead_code_eliminate(); + bool remove_duplicate_mrf_writes(); + bool remove_extra_rounding_modes(); + + fs_instruction_scheduler *prepare_scheduler(void *mem_ctx); + void schedule_instructions_pre_ra(fs_instruction_scheduler *sched, + instruction_scheduler_mode mode); + void schedule_instructions_post_ra(); + + void insert_gfx4_send_dependency_workarounds(); + void insert_gfx4_pre_send_dependency_workarounds(bblock_t *block, + fs_inst *inst); + void insert_gfx4_post_send_dependency_workarounds(bblock_t *block, + fs_inst *inst); + void vfail(const char *msg, va_list args); 
+ void fail(const char *msg, ...); + void limit_dispatch_width(unsigned n, const char *msg); + bool lower_uniform_pull_constant_loads(); + bool lower_load_payload(); + bool lower_pack(); + bool lower_regioning(); + bool lower_logical_sends(); + bool lower_integer_multiplication(); + bool lower_minmax(); + bool lower_simd_width(); + bool lower_barycentrics(); + bool lower_derivatives(); + bool lower_find_live_channel(); + bool lower_scoreboard(); + bool lower_sub_sat(); + bool opt_combine_constants(); + + void emit_repclear_shader(); + void emit_interpolation_setup_gfx4(); + void emit_interpolation_setup_gfx6(); + bool opt_peephole_sel(); + bool opt_saturate_propagation(); + bool opt_cmod_propagation(); + bool opt_zero_samples(); + + void set_tcs_invocation_id(); + + void emit_alpha_test(); + fs_inst *emit_single_fb_write(const brw::fs_builder &bld, + fs_reg color1, fs_reg color2, + fs_reg src0_alpha, unsigned components); + void do_emit_fb_writes(int nr_color_regions, bool replicate_alpha); + void emit_fb_writes(); + void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg()); + void emit_gs_control_data_bits(const fs_reg &vertex_count); + void emit_gs_thread_end(); + bool mark_last_urb_write_with_eot(); + void emit_tcs_thread_end(); + void emit_urb_fence(); + void emit_cs_terminate(); + + fs_reg interp_reg(const brw::fs_builder &bld, unsigned location, + unsigned channel, unsigned comp); + fs_reg per_primitive_reg(const brw::fs_builder &bld, + int location, unsigned comp); + + virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const; + virtual void dump_instructions_to_file(FILE *file) const; + + const brw_base_prog_key *const key; + const struct brw_sampler_prog_key_data *key_tex; + + struct brw_gs_compile *gs_compile; + + struct brw_stage_prog_data *prog_data; + + brw_analysis live_analysis; + brw_analysis regpressure_analysis; + brw_analysis performance_analysis; + + /** Number of uniform variable components visited. 
*/ + unsigned uniforms; + + /** Byte-offset for the next available spot in the scratch space buffer. */ + unsigned last_scratch; + + /** + * Array mapping UNIFORM register numbers to the push parameter index, + * or -1 if this uniform register isn't being uploaded as a push constant. + */ + int *push_constant_loc; + + fs_reg frag_depth; + fs_reg frag_stencil; + fs_reg sample_mask; + fs_reg outputs[VARYING_SLOT_MAX]; + fs_reg dual_src_output; + int first_non_payload_grf; + /** Either BRW_MAX_GRF or GFX7_MRF_HACK_START */ + unsigned max_grf; + + bool failed; + char *fail_msg; + + thread_payload *payload_; + + thread_payload &payload() { + return *this->payload_; + } + + vs_thread_payload &vs_payload() { + assert(stage == MESA_SHADER_VERTEX); + return *static_cast(this->payload_); + } + + tcs_thread_payload &tcs_payload() { + assert(stage == MESA_SHADER_TESS_CTRL); + return *static_cast(this->payload_); + } + + tes_thread_payload &tes_payload() { + assert(stage == MESA_SHADER_TESS_EVAL); + return *static_cast(this->payload_); + } + + gs_thread_payload &gs_payload() { + assert(stage == MESA_SHADER_GEOMETRY); + return *static_cast(this->payload_); + } + + fs_thread_payload &fs_payload() { + assert(stage == MESA_SHADER_FRAGMENT); + return *static_cast(this->payload_); + }; + + cs_thread_payload &cs_payload() { + assert(gl_shader_stage_uses_workgroup(stage)); + return *static_cast(this->payload_); + } + + task_mesh_thread_payload &task_mesh_payload() { + assert(stage == MESA_SHADER_TASK || stage == MESA_SHADER_MESH); + return *static_cast(this->payload_); + } + + bs_thread_payload &bs_payload() { + assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE); + return *static_cast(this->payload_); + } + + bool source_depth_to_render_target; + bool runtime_check_aads_emit; + + fs_reg pixel_x; + fs_reg pixel_y; + fs_reg pixel_z; + fs_reg wpos_w; + fs_reg pixel_w; + fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT]; + fs_reg final_gs_vertex_count; + fs_reg 
control_data_bits; + fs_reg invocation_id; + + unsigned grf_used; + bool spilled_any_registers; + bool needs_register_pressure; + + const unsigned dispatch_width; /**< 8, 16 or 32 */ + const unsigned max_polygons; + unsigned max_dispatch_width; + + /* The API selected subgroup size */ + unsigned api_subgroup_size; /**< 0, 8, 16, 32 */ + + struct shader_stats shader_stats; + + void lower_mul_dword_inst(fs_inst *inst, bblock_t *block); + void lower_mul_qword_inst(fs_inst *inst, bblock_t *block); + void lower_mulh_inst(fs_inst *inst, bblock_t *block); + + unsigned workgroup_size() const; + + void debug_optimizer(const nir_shader *nir, + const char *pass_name, + int iteration, int pass_num) const; +}; + +/** + * Return the flag register used in fragment shaders to keep track of live + * samples. On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32 + * dispatch mode, while earlier generations are constrained to f0.1, which + * limits the dispatch width to SIMD16 for fragment shaders that use discard. + */ +static inline unsigned +sample_mask_flag_subreg(const fs_visitor &s) +{ + assert(s.stage == MESA_SHADER_FRAGMENT); + return s.devinfo->ver >= 7 ? 2 : 1; +} + +/** + * The fragment shader code generator. + * + * Translates FS IR to actual i965 assembly code. 
 */
class fs_generator
{
public:
   fs_generator(const struct brw_compiler *compiler,
                const struct brw_compile_params *params,
                struct brw_stage_prog_data *prog_data,
                bool runtime_check_aads_emit,
                gl_shader_stage stage);
   ~fs_generator();

   void enable_debug(const char *shader_name);
   /* Emits assembly for the whole CFG; returns the offset of the end of the
    * generated code.
    */
   int generate_code(const cfg_t *cfg, int dispatch_width,
                     struct shader_stats shader_stats,
                     const brw::performance &perf,
                     struct brw_compile_stats *stats,
                     unsigned max_polygons = 0);
   void add_const_data(void *data, unsigned size);
   void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt);
   const unsigned *get_assembly();

private:
   /* Per-opcode emission helpers used by generate_code(). */
   void fire_fb_write(fs_inst *inst,
                      struct brw_reg payload,
                      struct brw_reg implied_header,
                      GLuint nr);
   void generate_send(fs_inst *inst,
                      struct brw_reg dst,
                      struct brw_reg desc,
                      struct brw_reg ex_desc,
                      struct brw_reg payload,
                      struct brw_reg payload2);
   void generate_fb_write(fs_inst *inst, struct brw_reg payload);
   void generate_fb_read(fs_inst *inst, struct brw_reg dst,
                         struct brw_reg payload);
   void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
   void generate_barrier(fs_inst *inst, struct brw_reg src);
   bool generate_linterp(fs_inst *inst, struct brw_reg dst,
                         struct brw_reg *src);
   void generate_tex(fs_inst *inst, struct brw_reg dst,
                     struct brw_reg surface_index,
                     struct brw_reg sampler_index);
   void generate_ddx(const fs_inst *inst,
                     struct brw_reg dst, struct brw_reg src);
   void generate_ddy(const fs_inst *inst,
                     struct brw_reg dst, struct brw_reg src);
   void generate_scratch_write(fs_inst *inst, struct brw_reg src);
   void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
   void generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst);
   void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
   void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                            struct brw_reg index,
                                            struct brw_reg offset);
   void generate_varying_pull_constant_load_gfx4(fs_inst *inst,
                                                 struct brw_reg dst,
                                                 struct brw_reg index);

   void generate_set_sample_id(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src0,
                               struct brw_reg src1);

   void generate_halt(fs_inst *inst);

   void generate_mov_indirect(fs_inst *inst,
                              struct brw_reg dst,
                              struct brw_reg reg,
                              struct brw_reg indirect_byte_offset);

   void generate_shuffle(fs_inst *inst,
                         struct brw_reg dst,
                         struct brw_reg src,
                         struct brw_reg idx);

   void generate_quad_swizzle(const fs_inst *inst,
                              struct brw_reg dst, struct brw_reg src,
                              unsigned swiz);

   bool patch_halt_jumps();

   const struct brw_compiler *compiler;
   const struct brw_compile_params *params;

   const struct intel_device_info *devinfo;

   struct brw_codegen *p;
   struct brw_stage_prog_data * const prog_data;

   unsigned dispatch_width; /**< 8, 16 or 32 */

   exec_list discard_halt_patches;
   bool runtime_check_aads_emit;
   bool debug_flag;
   const char *shader_name;
   gl_shader_stage stage;
   void *mem_ctx;
};

namespace brw {
   fs_reg
   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
                     brw_reg_type type = BRW_REGISTER_TYPE_F,
                     unsigned n = 1);

   fs_reg
   fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2]);

   /* Uniform holding the dynamic per-draw MSAA configuration flags. */
   inline fs_reg
   dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
   {
      return fs_reg(UNIFORM, wm_prog_data->msaa_flags_param,
                    BRW_REGISTER_TYPE_UD);
   }

   void
   check_dynamic_msaa_flag(const fs_builder &bld,
                           const struct brw_wm_prog_data *wm_prog_data,
                           enum intel_msaa_flags flag);

   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
}

void shuffle_from_32bit_read(const brw::fs_builder &bld,
                             const fs_reg &dst,
                             const fs_reg &src,
                             uint32_t first_component,
                             uint32_t components);

fs_reg setup_imm_df(const brw::fs_builder &bld,
                    double v);

fs_reg setup_imm_b(const brw::fs_builder &bld,
                   int8_t v);

fs_reg setup_imm_ub(const
brw::fs_builder &bld, + uint8_t v); + +enum brw_barycentric_mode brw_barycentric_mode(nir_intrinsic_instr *intr); + +uint32_t brw_fb_write_msg_control(const fs_inst *inst, + const struct brw_wm_prog_data *prog_data); + +void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data); + +bool brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width); + +fs_reg brw_sample_mask_reg(const brw::fs_builder &bld); +void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst); + +int brw_get_subgroup_id_param_index(const intel_device_info *devinfo, + const brw_stage_prog_data *prog_data); + +bool brw_lower_dpas(fs_visitor &v); + +void nir_to_brw(fs_visitor *s); + +#endif /* BRW_FS_H */ diff --git a/src/intel/compiler/elk/brw_fs_bank_conflicts.cpp b/src/intel/compiler/elk/brw_fs_bank_conflicts.cpp new file mode 100644 index 00000000000..8505748b0f8 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_bank_conflicts.cpp @@ -0,0 +1,955 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_bank_conflicts.cpp + * + * This file contains a GRF bank conflict mitigation pass. The pass is + * intended to be run after register allocation and works by rearranging the + * layout of the GRF space (without altering the semantics of the program) in + * a way that minimizes the number of GRF bank conflicts incurred by ternary + * instructions. + * + * Unfortunately there is close to no information about bank conflicts in the + * hardware spec, but experimentally on Gfx7-Gfx9 ternary instructions seem to + * incur an average bank conflict penalty of one cycle per SIMD8 op whenever + * the second and third source are stored in the same GRF bank (\sa bank_of() + * for the exact bank layout) which cannot be fetched during the same cycle by + * the EU, unless the EU logic manages to optimize out the read cycle of a + * duplicate source register (\sa is_conflict_optimized_out()). + * + * The asymptotic run-time of the algorithm is dominated by the + * shader_conflict_weight_matrix() computation below, which is O(n) on the + * number of instructions in the program, however for small and medium-sized + * programs the run-time is likely to be dominated by + * optimize_reg_permutation() which is O(m^3) on the number of GRF atoms of + * the program (\sa partitioning), which is bounded (since the program uses a + * bounded number of registers post-regalloc) and of the order of 100. For + * that reason optimize_reg_permutation() is vectorized in order to keep the + * cubic term within reasonable bounds for m close to its theoretical maximum. 
+ */ + +#include "brw_fs.h" +#include "brw_cfg.h" + +#ifdef __SSE2__ + +#include + +/** + * Thin layer around vector intrinsics so they can be easily replaced with + * e.g. the fall-back scalar path, an implementation with different vector + * width or using different SIMD architectures (AVX-512?!). + * + * This implementation operates on pairs of independent SSE2 integer vectors à + * la SIMD16 for somewhat improved throughput. SSE2 is supported by virtually + * all platforms that care about bank conflicts, so this path should almost + * always be available in practice. + */ +namespace { + /** + * SIMD integer vector data type. + */ + struct vector_type { + __m128i v[2]; + }; + + /** + * Scalar data type matching the representation of a single component of \p + * vector_type. + */ + typedef int16_t scalar_type; + + /** + * Maximum integer value representable as a \p scalar_type. + */ + const scalar_type max_scalar = INT16_MAX; + + /** + * Number of components of a \p vector_type. + */ + const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type); + + /** + * Set the i-th component of vector \p v to \p x. + */ + void + set(vector_type &v, unsigned i, scalar_type x) + { + assert(i < vector_width); + memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x)); + } + + /** + * Get the i-th component of vector \p v. + */ + scalar_type + get(const vector_type &v, unsigned i) + { + assert(i < vector_width); + scalar_type x; + memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x)); + return x; + } + + /** + * Add two vectors with saturation. + */ + vector_type + adds(const vector_type &v, const vector_type &w) + { + const vector_type u = {{ + _mm_adds_epi16(v.v[0], w.v[0]), + _mm_adds_epi16(v.v[1], w.v[1]) + }}; + return u; + } + + /** + * Subtract two vectors with saturation. 
+ */ + vector_type + subs(const vector_type &v, const vector_type &w) + { + const vector_type u = {{ + _mm_subs_epi16(v.v[0], w.v[0]), + _mm_subs_epi16(v.v[1], w.v[1]) + }}; + return u; + } + + /** + * Compute the bitwise conjunction of two vectors. + */ + vector_type + mask(const vector_type &v, const vector_type &w) + { + const vector_type u = {{ + _mm_and_si128(v.v[0], w.v[0]), + _mm_and_si128(v.v[1], w.v[1]) + }}; + return u; + } + + /** + * Reduce the components of a vector using saturating addition. + */ + scalar_type + sums(const vector_type &v) + { + const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]); + const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e)); + const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1)); + const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1)); + return _mm_extract_epi16(v1, 0); + } +} + +#else + +/** + * Thin layer around vector intrinsics so they can be easily replaced with + * e.g. the fall-back scalar path, an implementation with different vector + * width or using different SIMD architectures (AVX-512?!). + * + * This implementation operates on scalar values and doesn't rely on + * any vector extensions. This is mainly intended for debugging and + * to keep this file building on exotic platforms. + */ +namespace { + /** + * SIMD integer vector data type. + */ + typedef int16_t vector_type; + + /** + * Scalar data type matching the representation of a single component of \p + * vector_type. + */ + typedef int16_t scalar_type; + + /** + * Maximum integer value representable as a \p scalar_type. + */ + const scalar_type max_scalar = INT16_MAX; + + /** + * Number of components of a \p vector_type. + */ + const unsigned vector_width = 1; + + /** + * Set the i-th component of vector \p v to \p x. + */ + void + set(vector_type &v, unsigned i, scalar_type x) + { + assert(i < vector_width); + v = x; + } + + /** + * Get the i-th component of vector \p v. 
+ */ + scalar_type + get(const vector_type &v, unsigned i) + { + assert(i < vector_width); + return v; + } + + /** + * Add two vectors with saturation. + */ + vector_type + adds(vector_type v, vector_type w) + { + return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w)); + } + + /** + * Subtract two vectors with saturation. + */ + vector_type + subs(vector_type v, vector_type w) + { + return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w)); + } + + /** + * Compute the bitwise conjunction of two vectors. + */ + vector_type + mask(vector_type v, vector_type w) + { + return v & w; + } + + /** + * Reduce the components of a vector using saturating addition. + */ + scalar_type + sums(vector_type v) + { + return v; + } +} + +#endif + +/** + * Swap \p x and \p y. + */ +#define SWAP(x, y) do { \ + __typeof(y) _swap_tmp = y; \ + y = x; \ + x = _swap_tmp; \ + } while (0) + +namespace { + /** + * Variable-length vector type intended to represent cycle-count costs for + * arbitrary atom-to-bank assignments. It's indexed by a pair of integers + * (i, p), where i is an atom index and p in {0, 1} indicates the parity of + * the conflict (respectively, whether the cost is incurred whenever the + * atoms are assigned the same bank b or opposite-parity banks b and b^1). 
+ * \sa shader_conflict_weight_matrix() + */ + struct weight_vector_type { + weight_vector_type() : v(NULL), size(0) {} + + weight_vector_type(unsigned n) : v(alloc(n)), size(n) {} + + weight_vector_type(const weight_vector_type &u) : + v(alloc(u.size)), size(u.size) + { + memcpy(v, u.v, + DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type)); + } + + ~weight_vector_type() + { + free(v); + } + + weight_vector_type & + operator=(weight_vector_type u) + { + SWAP(v, u.v); + SWAP(size, u.size); + return *this; + } + + vector_type *v; + unsigned size; + + private: + static vector_type * + alloc(unsigned n) + { + const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type)); + const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type); + void *p; + if (posix_memalign(&p, align, size)) + return NULL; + memset(p, 0, size); + return reinterpret_cast(p); + } + }; + + /** + * Set the (i, p)-th component of weight vector \p v to \p x. + */ + void + set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x) + { + set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x); + } + + /** + * Get the (i, p)-th component of weight vector \p v. + */ + scalar_type + get(const weight_vector_type &v, unsigned i, unsigned p) + { + return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width); + } + + /** + * Swap the (i, p)-th and (j, q)-th components of weight vector \p v. + */ + void + swap(weight_vector_type &v, + unsigned i, unsigned p, + unsigned j, unsigned q) + { + const scalar_type tmp = get(v, i, p); + set(v, i, p, get(v, j, q)); + set(v, j, q, tmp); + } +} + +namespace { + /** + * Object that represents the partitioning of an arbitrary register space + * into indivisible units (referred to as atoms below) that can potentially + * be rearranged independently from other registers. The partitioning is + * inferred from a number of contiguity requirements specified using + * require_contiguous(). 
This allows efficient look-up of the atom index a + * given register address belongs to, or conversely the range of register + * addresses that belong to a given atom. + */ + struct partitioning { + /** + * Create a (for the moment unrestricted) partitioning of a register + * file of size \p n. The units are arbitrary. + */ + partitioning(unsigned n) : + max_reg(n), + offsets(new unsigned[n + num_terminator_atoms]), + atoms(new unsigned[n + num_terminator_atoms]) + { + for (unsigned i = 0; i < n + num_terminator_atoms; i++) { + offsets[i] = i; + atoms[i] = i; + } + } + + partitioning(const partitioning &p) : + max_reg(p.max_reg), + offsets(new unsigned[p.num_atoms() + num_terminator_atoms]), + atoms(new unsigned[p.max_reg + num_terminator_atoms]) + { + memcpy(offsets, p.offsets, + sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms)); + memcpy(atoms, p.atoms, + sizeof(unsigned) * (p.max_reg + num_terminator_atoms)); + } + + ~partitioning() + { + delete[] offsets; + delete[] atoms; + } + + partitioning & + operator=(partitioning p) + { + SWAP(max_reg, p.max_reg); + SWAP(offsets, p.offsets); + SWAP(atoms, p.atoms); + return *this; + } + + /** + * Require register range [reg, reg + n[ to be considered part of the + * same atom. + */ + void + require_contiguous(unsigned reg, unsigned n) + { + unsigned r = atoms[reg]; + + /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the + * case that the specified contiguity requirement leads to the fusion + * (yay) of one or more existing atoms. + */ + for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) { + if (offsets[atoms[reg1]] < reg + n) { + atoms[reg1] = r; + } else { + if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]]) + r++; + + offsets[r] = offsets[atoms[reg1]]; + atoms[reg1] = r; + } + } + } + + /** + * Get the atom index register address \p reg belongs to. 
+ */ + unsigned + atom_of_reg(unsigned reg) const + { + return atoms[reg]; + } + + /** + * Get the base register address that belongs to atom \p r. + */ + unsigned + reg_of_atom(unsigned r) const + { + return offsets[r]; + } + + /** + * Get the size of atom \p r in register address units. + */ + unsigned + size_of_atom(unsigned r) const + { + assert(r < num_atoms()); + return reg_of_atom(r + 1) - reg_of_atom(r); + } + + /** + * Get the number of atoms the whole register space is partitioned into. + */ + unsigned + num_atoms() const + { + return atoms[max_reg]; + } + + private: + /** + * Number of trailing atoms inserted for convenience so among other + * things we don't need to special-case the last element in + * size_of_atom(). + */ + static const unsigned num_terminator_atoms = 1; + unsigned max_reg; + unsigned *offsets; + unsigned *atoms; + }; + + /** + * Only GRF sources (whether they have been register-allocated or not) can + * possibly incur bank conflicts. + */ + bool + is_grf(const fs_reg &r) + { + return r.file == VGRF || r.file == FIXED_GRF; + } + + /** + * Register offset of \p r in GRF units. Useful because the representation + * of GRFs post-register allocation is somewhat inconsistent and depends on + * whether the register already had a fixed GRF offset prior to register + * allocation or whether it was part of a VGRF allocation. + */ + unsigned + reg_of(const fs_reg &r) + { + assert(is_grf(r)); + if (r.file == VGRF) + return r.nr + r.offset / REG_SIZE; + else + return reg_offset(r) / REG_SIZE; + } + + /** + * Calculate the finest partitioning of the GRF space compatible with the + * register contiguity requirements derived from all instructions part of + * the program. 
    */
   partitioning
   shader_reg_partitioning(const fs_visitor *v)
   {
      partitioning p(BRW_MAX_GRF);

      /* Every write and read of a GRF range fuses that range into one atom. */
      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (is_grf(inst->dst))
            p.require_contiguous(reg_of(inst->dst), regs_written(inst));

         for (int i = 0; i < inst->sources; i++) {
            if (is_grf(inst->src[i]))
               p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
         }
      }

      return p;
   }

   /**
    * Return the set of GRF atoms that should be left untouched at their
    * original location to avoid violating hardware or software assumptions.
    * The caller owns the returned heap array of p.num_atoms() flags.
    */
   bool *
   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
   {
      bool *constrained = new bool[p.num_atoms()]();

      /* These are read implicitly by some send-message instructions without
       * any indication at the IR level.  Assume they are unsafe to move
       * around.
       */
      for (unsigned reg = 0; reg < 2; reg++)
         constrained[p.atom_of_reg(reg)] = true;

      /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
       * subsection "EUISA Instructions", Send Message (page 990):
       *
       * "r127 must not be used for return address when there is a src and
       * dest overlap in send instruction."
       *
       * Register allocation ensures that, so don't move 127 around to avoid
       * breaking that property.
       */
      if (v->devinfo->ver >= 8)
         constrained[p.atom_of_reg(127)] = true;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         /* Assume that anything referenced via fixed GRFs is baked into the
          * hardware's fixed-function logic and may be unsafe to move around.
          * Also take into account the source GRF restrictions of EOT
          * send-message instructions.
          */
         if (inst->dst.file == FIXED_GRF)
            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;

         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == FIXED_GRF ||
                (is_grf(inst->src[i]) && inst->eot))
               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
         }

         /* Preserve the original allocation of VGRFs used by the barycentric
          * source of the LINTERP instruction on Gfx6, since pair-aligned
          * barycentrics allow the PLN instruction to be used.
          */
         if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
             inst->opcode == FS_OPCODE_LINTERP)
            constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;

         /* The location of the Gfx7 MRF hack registers is hard-coded in the
          * rest of the compiler back-end.  Don't attempt to move them around.
          */
         if (v->devinfo->ver >= 7) {
            assert(inst->dst.file != MRF);

            for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
               const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
               constrained[p.atom_of_reg(reg)] = true;
            }
         }
      }

      return constrained;
   }

   /**
    * Return whether the hardware will be able to prevent a bank conflict by
    * optimizing out the read cycle of a source register.  The formula was
    * found experimentally.
    */
   bool
   is_conflict_optimized_out(const intel_device_info *devinfo,
                             const fs_inst *inst)
   {
      /* Gfx9+ skips the read cycle when a source register is duplicated
       * among src0/src1/src2 of a ternary instruction.
       */
      return devinfo->ver >= 9 &&
             ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
                                        reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
              reg_of(inst->src[1]) == reg_of(inst->src[2]));
   }

   /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole program
    * for any given atom-to-bank assignment.
    *
    * More precisely, if C_r_s_p is the result of this function, the total
    * cost of all bank conflicts involving any given atom r can be readily
    * recovered as follows:
    *
    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
    *
    * where d_i_j is the Kronecker delta, and B_r indicates the bank
    * assignment of r.  \sa delta_conflicts() for a vectorized implementation
    * of the expression above.
    *
    * FINISHME: Teach this about the Gfx10+ bank conflict rules, which are
    *           somewhat more relaxed than on previous generations.  In the
    *           meantime optimizing based on Gfx9 weights is likely to be more
    *           helpful than not optimizing at all.
    */
   weight_vector_type *
   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
   {
      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
      for (unsigned r = 0; r < p.num_atoms(); r++)
         conflicts[r] = weight_vector_type(2 * p.num_atoms());

      /* Crude approximation of the number of times the current basic block
       * will be executed at run-time.
       */
      unsigned block_scale = 1;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (inst->opcode == BRW_OPCODE_DO) {
            /* Entering a loop: weight its body 10x more heavily. */
            block_scale *= 10;

         } else if (inst->opcode == BRW_OPCODE_WHILE) {
            block_scale /= 10;

         } else if (inst->is_3src(v->compiler) &&
                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));

            /* Estimate of the cycle-count cost of incurring a bank conflict
             * for this instruction.  This is only true on the average, for a
             * sequence of back-to-back ternary instructions, since the EU
             * front-end only seems to be able to issue a new instruction at
             * an even cycle.  The cost of a bank conflict incurred by an
             * isolated ternary instruction may be higher.
             */
            const unsigned exec_size = inst->dst.component_size(inst->exec_size);
            const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
                                                                    REG_SIZE);

            /* Neglect same-atom conflicts (since they're either trivial or
             * impossible to avoid without splitting the atom), and conflicts
             * known to be optimized out by the hardware.
             */
            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
               /* Calculate the parity of the sources relative to the start of
                * their respective atoms.  If their parity is the same (and
                * none of the atoms straddle the 2KB mark), the instruction
                * will incur a conflict iff both atoms are assigned the same
                * bank b.  If their parity is opposite, the instruction will
                * incur a conflict iff they are assigned opposite banks (b and
                * b^1).
                */
               const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
               const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
               const unsigned p = p_r ^ p_s;

               /* Calculate the updated cost of a hypothetical conflict
                * between atoms r and s.  Note that the weight matrix is
                * symmetric with respect to indices r and s by construction.
                */
               const scalar_type w = MIN2(unsigned(max_scalar),
                                          get(conflicts[r], s, p) + cycle_scale);
               set(conflicts[r], s, p, w);
               set(conflicts[s], r, p, w);
            }
         }
      }

      return conflicts;
   }

   /**
    * Return the set of GRF atoms that could potentially lead to bank
    * conflicts if laid out unfavorably in the GRF space according to
    * the specified \p conflicts matrix (\sa
    * shader_conflict_weight_matrix()).
+ */ + bool * + have_any_conflicts(const partitioning &p, + const weight_vector_type *conflicts) + { + bool *any_conflicts = new bool[p.num_atoms()](); + + for (unsigned r = 0; r < p.num_atoms(); r++) { + const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width); + for (unsigned s = 0; s < m; s++) + any_conflicts[r] |= sums(conflicts[r].v[s]); + } + + return any_conflicts; + } + + /** + * Calculate the difference between two S(B) cost estimates as defined + * above (\sa shader_conflict_weight_matrix()). This represents the + * (partial) cycle-count benefit from moving an atom r from bank p to n. + * The respective bank assignments Bp and Bn are encoded as the \p + * bank_mask_p and \p bank_mask_n bitmasks for efficient computation, + * according to the formula: + * + * bank_mask(B)_s_p = -d_(p^B_r)_(B_s) + * + * Notice the similarity with the delta function in the S(B) expression + * above, and how bank_mask(B) can be precomputed for every possible + * selection of r since bank_mask(B) only depends on it via B_r that may + * only assume one of four different values, so the caller can keep every + * possible bank_mask(B) vector in memory without much hassle (\sa + * bank_characteristics()). + */ + int + delta_conflicts(const weight_vector_type &bank_mask_p, + const weight_vector_type &bank_mask_n, + const weight_vector_type &conflicts) + { + const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width); + vector_type s_p = {}, s_n = {}; + + for (unsigned r = 0; r < m; r++) { + s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r])); + s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r])); + } + + return sums(subs(s_p, s_n)); + } + + /** + * Register atom permutation, represented as the start GRF offset each atom + * is mapped into. 
 */
   struct permutation {
      permutation() : v(NULL), size(0) {}

      /* Allocate a zero-initialized mapping for \p n atoms (note the
       * value-initializing "()" on the array new).
       */
      permutation(unsigned n) :
         v(new unsigned[n]()), size(n) {}

      permutation(const permutation &p) :
         v(new unsigned[p.size]), size(p.size)
      {
         memcpy(v, p.v, p.size * sizeof(unsigned));
      }

      ~permutation()
      {
         delete[] v;
      }

      /* Copy-and-swap assignment: \p p is taken by value, so swapping with
       * it both installs the new contents and lets p's destructor release
       * this object's previous storage.  Also makes self-assignment safe.
       */
      permutation &
      operator=(permutation p)
      {
         SWAP(v, p.v);
         SWAP(size, p.size);
         return *this;
      }

      /* v[r] is the start GRF offset atom r is mapped to. */
      unsigned *v;
      unsigned size;
   };

   /**
    * Return an identity permutation of GRF atoms.
    */
   permutation
   identity_reg_permutation(const partitioning &p)
   {
      permutation map(p.num_atoms());

      for (unsigned r = 0; r < map.size; r++)
         map.v[r] = p.reg_of_atom(r);

      return map;
   }

   /**
    * Return the bank index of GRF address \p reg, numbered according to the
    * table:
    *            Even     Odd
    *    Lo       0        1
    *    Hi       2        3
    */
   unsigned
   bank_of(unsigned reg)
   {
      /* Bit 6 of the GRF number selects the Lo/Hi half (contributing 2),
       * bit 0 selects even/odd (contributing 1), per the table above.
       */
      return (reg & 0x40) >> 5 | (reg & 1);
   }

   /**
    * Return bitmasks suitable for use as bank mask arguments for the
    * delta_conflicts() computation.  Note that this is just the (negative)
    * characteristic function of each bank, if you regard it as a set
    * containing all atoms assigned to it according to the \p map array.
    * Caller owns the returned array of four vectors (delete[]).
    */
   weight_vector_type *
   bank_characteristics(const permutation &map)
   {
      weight_vector_type *banks = new weight_vector_type[4];

      for (unsigned b = 0; b < 4; b++) {
         banks[b] = weight_vector_type(2 * map.size);

         for (unsigned j = 0; j < map.size; j++) {
            /* Entry (j, p) is all-ones (-1) iff atom j at parity p belongs
             * to bank b, matching the -d_(p^B_r)_(B_s) formula above.
             */
            for (unsigned p = 0; p < 2; p++)
               set(banks[b], j, p,
                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
         }
      }

      return banks;
   }

   /**
    * Return an improved permutation of GRF atoms based on \p map attempting
    * to reduce the total cycle-count cost of bank conflicts greedily.
    *
    * Note that this doesn't attempt to merge multiple atoms into one, which
    * may allow it to do a better job in some cases -- It simply reorders
    * existing atoms in the GRF space without affecting their identity.
 */
   permutation
   optimize_reg_permutation(const partitioning &p,
                            const bool *constrained,
                            const weight_vector_type *conflicts,
                            permutation map)
   {
      const bool *any_conflicts = have_any_conflicts(p, conflicts);
      weight_vector_type *banks = bank_characteristics(map);

      for (unsigned r = 0; r < map.size; r++) {
         const unsigned bank_r = bank_of(map.v[r]);

         if (!constrained[r]) {
            /* Greedily look for the swap partner s whose exchange with r
             * yields the largest positive cycle-count benefit.
             */
            unsigned best_s = r;
            int best_benefit = 0;

            for (unsigned s = 0; s < map.size; s++) {
               const unsigned bank_s = bank_of(map.v[s]);

               /* Only consider swaps between unconstrained, equally sized
                * atoms in different banks where at least one side has any
                * conflict weight at all.
                */
               if (bank_r != bank_s && !constrained[s] &&
                   p.size_of_atom(r) == p.size_of_atom(s) &&
                   (any_conflicts[r] || any_conflicts[s])) {
                  const int benefit =
                     delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
                     delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);

                  if (benefit > best_benefit) {
                     best_s = s;
                     best_benefit = benefit;
                  }
               }
            }

            if (best_s != r) {
               /* Keep the bank characteristic vectors in sync with the swap
                * so later iterations see the updated assignment.
                */
               for (unsigned b = 0; b < 4; b++) {
                  for (unsigned p = 0; p < 2; p++)
                     swap(banks[b], r, p, best_s, p);
               }

               SWAP(map.v[r], map.v[best_s]);
            }
         }
      }

      delete[] banks;
      delete[] any_conflicts;
      return map;
   }

   /**
    * Apply the GRF atom permutation given by \p map to register \p r and
    * return the result.
    */
   fs_reg
   transform(const partitioning &p, const permutation &map, fs_reg r)
   {
      if (r.file == VGRF) {
         /* Rebase the register on the new location of its atom, keeping its
          * offset within the atom; only the sub-REG_SIZE part of the offset
          * survives since the whole-register part moved into nr.
          */
         const unsigned reg = reg_of(r);
         const unsigned s = p.atom_of_reg(reg);
         r.nr = map.v[s] + reg - p.reg_of_atom(s);
         r.offset = r.offset % REG_SIZE;
      }

      return r;
   }
}

bool
fs_visitor::opt_bank_conflicts()
{
   assert(grf_used || !"Must be called after register allocation");

   /* TODO: Re-work this pass for Gfx20+. */
   /* NOTE(review): this check looks vestigial from the shared brw source --
    * confirm whether the elk fork can ever see devinfo->ver >= 20.
    */
   if (devinfo->ver >= 20)
      return false;

   /* No ternary instructions -- No bank conflicts.
 */
   if (devinfo->ver < 6)
      return false;

   const partitioning p = shader_reg_partitioning(this);
   const bool *constrained = shader_reg_constraints(this, p);
   const weight_vector_type *conflicts =
      shader_conflict_weight_matrix(this, p);
   const permutation map =
      optimize_reg_permutation(p, constrained, conflicts,
                               identity_reg_permutation(p));

   /* Rewrite every destination and source register according to the
    * optimized atom permutation.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      inst->dst = transform(p, map, inst->dst);

      for (int i = 0; i < inst->sources; i++)
         inst->src[i] = transform(p, map, inst->src[i]);
   }

   delete[] conflicts;
   delete[] constrained;
   return true;
}

/**
 * Return whether the instruction incurs GRF bank conflict cycles.
 *
 * Note that this is only accurate after register allocation because otherwise
 * we don't know which bank each VGRF is going to end up aligned to.
 */
bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst)
{
   /* A conflict requires a ternary instruction whose GRF sources 1 and 2
    * land in the same bank and isn't already optimized out by the hardware.
    */
   return is_3src(isa, inst->opcode) &&
          is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
          !is_conflict_optimized_out(isa->devinfo, inst);
}
diff --git a/src/intel/compiler/elk/brw_fs_builder.h b/src/intel/compiler/elk/brw_fs_builder.h
new file mode 100644
index 00000000000..63244f0b75b
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_builder.h
@@ -0,0 +1,965 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph)
shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_FS_BUILDER_H +#define BRW_FS_BUILDER_H + +#include "brw_ir_fs.h" +#include "brw_shader.h" +#include "brw_eu.h" +#include "brw_fs.h" + +namespace brw { + /** + * Toolbox to assemble an FS IR program out of individual instructions. + * + * This object is meant to have an interface consistent with + * brw::vec4_builder. They cannot be fully interchangeable because + * brw::fs_builder generates scalar code while brw::vec4_builder generates + * vector code. + */ + class fs_builder { + public: + /** Type used in this IR to represent a source of an instruction. */ + typedef fs_reg src_reg; + + /** Type used in this IR to represent the destination of an instruction. */ + typedef fs_reg dst_reg; + + /** Type used in this IR to represent an instruction. */ + typedef fs_inst instruction; + + /** + * Construct an fs_builder that inserts instructions into \p shader. + * \p dispatch_width gives the native execution width of the program. + */ + fs_builder(fs_visitor *shader, + unsigned dispatch_width) : + shader(shader), block(NULL), cursor(NULL), + _dispatch_width(dispatch_width), + _group(0), + force_writemask_all(false), + annotation() + { + } + + explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {} + + /** + * Construct an fs_builder that inserts instructions into \p shader + * before instruction \p inst in basic block \p block. 
The default + * execution controls and debug annotation are initialized from the + * instruction passed as argument. + */ + fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) : + shader(shader), block(block), cursor(inst), + _dispatch_width(inst->exec_size), + _group(inst->group), + force_writemask_all(inst->force_writemask_all) + { + annotation.str = inst->annotation; + annotation.ir = inst->ir; + } + + /** + * Construct an fs_builder that inserts instructions before \p cursor in + * basic block \p block, inheriting other code generation parameters + * from this. + */ + fs_builder + at(bblock_t *block, exec_node *cursor) const + { + fs_builder bld = *this; + bld.block = block; + bld.cursor = cursor; + return bld; + } + + /** + * Construct an fs_builder appending instructions at the end of the + * instruction list of the shader, inheriting other code generation + * parameters from this. + */ + fs_builder + at_end() const + { + return at(NULL, (exec_node *)&shader->instructions.tail_sentinel); + } + + /** + * Construct a builder specifying the default SIMD width and group of + * channel enable signals, inheriting other code generation parameters + * from this. + * + * \p n gives the default SIMD width, \p i gives the slot group used for + * predication and control flow masking in multiples of \p n channels. + */ + fs_builder + group(unsigned n, unsigned i) const + { + fs_builder bld = *this; + + if (n <= dispatch_width() && i < dispatch_width() / n) { + bld._group += i * n; + } else { + /* The requested channel group isn't a subset of the channel group + * of this builder, which means that the resulting instructions + * would use (potentially undefined) channel enable signals not + * specified by the parent builder. 
That's only valid if the + * instruction doesn't have per-channel semantics, in which case + * we should clear off the default group index in order to prevent + * emitting instructions with channel group not aligned to their + * own execution size. + */ + assert(force_writemask_all); + bld._group = 0; + } + + bld._dispatch_width = n; + return bld; + } + + /** + * Alias for group() with width equal to eight. + */ + fs_builder + quarter(unsigned i) const + { + return group(8, i); + } + + /** + * Construct a builder with per-channel control flow execution masking + * disabled if \p b is true. If control flow execution masking is + * already disabled this has no effect. + */ + fs_builder + exec_all(bool b = true) const + { + fs_builder bld = *this; + if (b) + bld.force_writemask_all = true; + return bld; + } + + /** + * Construct a builder with the given debug annotation info. + */ + fs_builder + annotate(const char *str, const void *ir = NULL) const + { + fs_builder bld = *this; + bld.annotation.str = str; + bld.annotation.ir = ir; + return bld; + } + + /** + * Get the SIMD width in use. + */ + unsigned + dispatch_width() const + { + return _dispatch_width; + } + + /** + * Get the channel group in use. + */ + unsigned + group() const + { + return _group; + } + + /** + * Allocate a virtual register of natural vector size (one for this IR) + * and SIMD width. \p n gives the amount of space to allocate in + * dispatch_width units (which is just enough space for one logical + * component in this IR). + */ + dst_reg + vgrf(enum brw_reg_type type, unsigned n = 1) const + { + const unsigned unit = reg_unit(shader->devinfo); + assert(dispatch_width() <= 32); + + if (n > 0) + return dst_reg(VGRF, shader->alloc.allocate( + DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), + unit * REG_SIZE) * unit), + type); + else + return retype(null_reg_ud(), type); + } + + /** + * Create a null register of floating type. 
+ */ + dst_reg + null_reg_f() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F)); + } + + dst_reg + null_reg_df() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF)); + } + + /** + * Create a null register of signed integer type. + */ + dst_reg + null_reg_d() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + } + + /** + * Create a null register of unsigned integer type. + */ + dst_reg + null_reg_ud() const + { + return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); + } + + /** + * Insert an instruction into the program. + */ + instruction * + emit(const instruction &inst) const + { + return emit(new(shader->mem_ctx) instruction(inst)); + } + + /** + * Create and insert a nullary control instruction into the program. + */ + instruction * + emit(enum opcode opcode) const + { + return emit(instruction(opcode, dispatch_width())); + } + + /** + * Create and insert a nullary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst) const + { + return emit(instruction(opcode, dispatch_width(), dst)); + } + + /** + * Create and insert a unary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const + { + switch (opcode) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return emit(instruction(opcode, dispatch_width(), dst, + fix_math_operand(src0))); + + default: + return emit(instruction(opcode, dispatch_width(), dst, src0)); + } + } + + /** + * Create and insert a binary instruction into the program. 
+ */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1) const + { + switch (opcode) { + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return emit(instruction(opcode, dispatch_width(), dst, + fix_math_operand(src0), + fix_math_operand(src1))); + + default: + return emit(instruction(opcode, dispatch_width(), dst, + src0, src1)); + + } + } + + /** + * Create and insert a ternary instruction into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, + const src_reg &src1, const src_reg &src2) const + { + switch (opcode) { + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + return emit(instruction(opcode, dispatch_width(), dst, + fix_3src_operand(src0), + fix_3src_operand(src1), + fix_3src_operand(src2))); + + default: + return emit(instruction(opcode, dispatch_width(), dst, + src0, src1, src2)); + } + } + + /** + * Create and insert an instruction with a variable number of sources + * into the program. + */ + instruction * + emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[], + unsigned n) const + { + /* Use the emit() methods for specific operand counts to ensure that + * opcode-specific operand fixups occur. + */ + if (n == 2) { + return emit(opcode, dst, srcs[0], srcs[1]); + } else if (n == 3) { + return emit(opcode, dst, srcs[0], srcs[1], srcs[2]); + } else { + return emit(instruction(opcode, dispatch_width(), dst, srcs, n)); + } + } + + /** + * Insert a preallocated instruction into the program. 
+ */ + instruction * + emit(instruction *inst) const + { + assert(inst->exec_size <= 32); + assert(inst->exec_size == dispatch_width() || + force_writemask_all); + + inst->group = _group; + inst->force_writemask_all = force_writemask_all; + inst->annotation = annotation.str; + inst->ir = annotation.ir; + + if (block) + static_cast(cursor)->insert_before(block, inst); + else + cursor->insert_before(inst); + + return inst; + } + + /** + * Select \p src0 if the comparison of both sources with the given + * conditional mod evaluates to true, otherwise select \p src1. + * + * Generally useful to get the minimum or maximum of two values. + */ + instruction * + emit_minmax(const dst_reg &dst, const src_reg &src0, + const src_reg &src1, brw_conditional_mod mod) const + { + assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L); + + /* In some cases we can't have bytes as operand for src1, so use the + * same type for both operand. + */ + return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * Copy any live channel from \p src to the first channel of the result. + */ + src_reg + emit_uniformize(const src_reg &src) const + { + /* FIXME: We use a vector chan_index and dst to allow constant and + * copy propagration to move result all the way into the consuming + * instruction (typically a surface index or sampler index for a + * send). This uses 1 or 3 extra hw registers in 16 or 32 wide + * dispatch. Once we teach const/copy propagation about scalars we + * should go back to scalar destinations here. 
+ */ + const fs_builder ubld = exec_all(); + const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD); + const dst_reg dst = vgrf(src.type); + + ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); + ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0)); + + return src_reg(component(dst, 0)); + } + + src_reg + move_to_vgrf(const src_reg &src, unsigned num_components) const + { + src_reg *const src_comps = new src_reg[num_components]; + for (unsigned i = 0; i < num_components; i++) + src_comps[i] = offset(src, dispatch_width(), i); + + const dst_reg dst = vgrf(src.type, num_components); + LOAD_PAYLOAD(dst, src_comps, num_components, 0); + + delete[] src_comps; + + return src_reg(dst); + } + + void + emit_scan_step(enum opcode opcode, brw_conditional_mod mod, + const dst_reg &tmp, + unsigned left_offset, unsigned left_stride, + unsigned right_offset, unsigned right_stride) const + { + dst_reg left, right; + left = horiz_stride(horiz_offset(tmp, left_offset), left_stride); + right = horiz_stride(horiz_offset(tmp, right_offset), right_stride); + if ((tmp.type == BRW_REGISTER_TYPE_Q || + tmp.type == BRW_REGISTER_TYPE_UQ) && + !shader->devinfo->has_64bit_int) { + switch (opcode) { + case BRW_OPCODE_MUL: + /* This will get lowered by integer MUL lowering */ + set_condmod(mod, emit(opcode, right, left, right)); + break; + + case BRW_OPCODE_SEL: { + /* In order for the comparisons to work out right, we need our + * comparisons to be strict. + */ + assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE); + if (mod == BRW_CONDITIONAL_GE) + mod = BRW_CONDITIONAL_G; + + /* We treat the bottom 32 bits as unsigned regardless of + * whether or not the integer as a whole is signed. 
+ */ + dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0); + dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0); + + /* The upper bits get the same sign as the 64-bit type */ + brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type); + dst_reg right_high = subscript(right, type32, 1); + dst_reg left_high = subscript(left, type32, 1); + + /* Build up our comparison: + * + * l_hi < r_hi || (l_hi == r_hi && l_low < r_low) + */ + CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD), + retype(right_low, BRW_REGISTER_TYPE_UD), mod); + set_predicate(BRW_PREDICATE_NORMAL, + CMP(null_reg_ud(), left_high, right_high, + BRW_CONDITIONAL_EQ)); + set_predicate_inv(BRW_PREDICATE_NORMAL, true, + CMP(null_reg_ud(), left_high, right_high, mod)); + + /* We could use selects here or we could use predicated MOVs + * because the destination and second source (if it were a SEL) + * are the same. + */ + set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low)); + set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high)); + break; + } + + default: + unreachable("Unsupported 64-bit scan op"); + } + } else { + set_condmod(mod, emit(opcode, right, left, right)); + } + } + + void + emit_scan(enum opcode opcode, const dst_reg &tmp, + unsigned cluster_size, brw_conditional_mod mod) const + { + assert(dispatch_width() >= 8); + + /* The instruction splitting code isn't advanced enough to split + * these so we need to handle that ourselves. 
+ */ + if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) { + const unsigned half_width = dispatch_width() / 2; + const fs_builder ubld = exec_all().group(half_width, 0); + dst_reg left = tmp; + dst_reg right = horiz_offset(tmp, half_width); + ubld.emit_scan(opcode, left, cluster_size, mod); + ubld.emit_scan(opcode, right, cluster_size, mod); + if (cluster_size > half_width) { + ubld.emit_scan_step(opcode, mod, tmp, + half_width - 1, 0, half_width, 1); + } + return; + } + + if (cluster_size > 1) { + const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0); + ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2); + } + + if (cluster_size > 2) { + if (type_sz(tmp.type) <= 4) { + const fs_builder ubld = + exec_all().group(dispatch_width() / 4, 0); + ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4); + ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4); + } else { + /* For 64-bit types, we have to do things differently because + * the code above would land us with destination strides that + * the hardware can't handle. Fortunately, we'll only be + * 8-wide in that case and it's the same number of + * instructions. 
+ */ + const fs_builder ubld = exec_all().group(2, 0); + for (unsigned i = 0; i < dispatch_width(); i += 4) + ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1); + } + } + + for (unsigned i = 4; + i < MIN2(cluster_size, dispatch_width()); + i *= 2) { + const fs_builder ubld = exec_all().group(i, 0); + ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1); + + if (dispatch_width() > i * 2) + ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1); + + if (dispatch_width() > i * 4) { + ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1); + ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1); + } + } + } + + instruction * + emit_undef_for_dst(const instruction *old_inst) const + { + assert(old_inst->dst.file == VGRF); + instruction *inst = emit(SHADER_OPCODE_UNDEF, + retype(old_inst->dst, BRW_REGISTER_TYPE_UD)); + inst->size_written = old_inst->size_written; + + return inst; + } + + /** + * Assorted arithmetic ops. + * @{ + */ +#define ALU1(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0); \ + } + +#define ALU2(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0, src1); \ + } + +#define ALU2_ACC(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ + { \ + instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \ + inst->writes_accumulator = true; \ + return inst; \ + } + +#define ALU3(op) \ + instruction * \ + op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \ + const src_reg &src2) const \ + { \ + return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \ + } + + ALU2(ADD) + ALU3(ADD3) + ALU2_ACC(ADDC) + ALU2(AND) + ALU2(ASR) + ALU2(AVG) + ALU3(BFE) + ALU2(BFI1) + ALU3(BFI2) + ALU1(BFREV) + ALU1(CBIT) + ALU1(DIM) + ALU2(DP2) + ALU2(DP3) + ALU2(DP4) + ALU2(DPH) + ALU1(FBH) + ALU1(FBL) + ALU1(FRC) + 
ALU3(DP4A) + ALU2(LINE) + ALU1(LZD) + ALU2(MAC) + ALU2_ACC(MACH) + ALU3(MAD) + ALU1(MOV) + ALU2(MUL) + ALU1(NOT) + ALU2(OR) + ALU2(PLN) + ALU1(RNDD) + ALU1(RNDE) + ALU1(RNDU) + ALU1(RNDZ) + ALU2(ROL) + ALU2(ROR) + ALU2(SAD2) + ALU2_ACC(SADA2) + ALU2(SEL) + ALU2(SHL) + ALU2(SHR) + ALU2_ACC(SUBB) + ALU2(XOR) + +#undef ALU3 +#undef ALU2_ACC +#undef ALU2 +#undef ALU1 + + instruction * + F32TO16(const dst_reg &dst, const src_reg &src) const + { + assert(dst.type == BRW_REGISTER_TYPE_HF); + assert(src.type == BRW_REGISTER_TYPE_F); + + if (shader->devinfo->ver >= 8) { + return MOV(dst, src); + } else { + assert(shader->devinfo->ver == 7); + return emit(BRW_OPCODE_F32TO16, + retype(dst, BRW_REGISTER_TYPE_W), src); + } + } + + instruction * + F16TO32(const dst_reg &dst, const src_reg &src) const + { + assert(dst.type == BRW_REGISTER_TYPE_F); + assert(src.type == BRW_REGISTER_TYPE_HF); + + if (shader->devinfo->ver >= 8) { + return MOV(dst, src); + } else { + assert(shader->devinfo->ver == 7); + return emit(BRW_OPCODE_F16TO32, + dst, retype(src, BRW_REGISTER_TYPE_W)); + } + } + /** @} */ + + /** + * CMP: Sets the low bit of the destination channels with the result + * of the comparison, while the upper bits are undefined, and updates + * the flag register with the packed 16 bits of the result. + */ + instruction * + CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1, + brw_conditional_mod condition) const + { + /* Take the instruction: + * + * CMP null src0 src1 + * + * Original gfx4 does type conversion to the destination type + * before comparison, producing garbage results for floating + * point comparisons. + * + * The destination type doesn't matter on newer generations, + * so we set the type to match src0 so we can compact the + * instruction. 
+ */ + return set_condmod(condition, + emit(BRW_OPCODE_CMP, retype(dst, src0.type), + fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * CMPN: Behaves like CMP, but produces true if src1 is NaN. + */ + instruction * + CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1, + brw_conditional_mod condition) const + { + /* Take the instruction: + * + * CMP null src0 src1 + * + * Original gfx4 does type conversion to the destination type + * before comparison, producing garbage results for floating + * point comparisons. + * + * The destination type doesn't matter on newer generations, + * so we set the type to match src0 so we can compact the + * instruction. + */ + return set_condmod(condition, + emit(BRW_OPCODE_CMPN, retype(dst, src0.type), + fix_unsigned_negate(src0), + fix_unsigned_negate(src1))); + } + + /** + * Gfx4 predicated IF. + */ + instruction * + IF(brw_predicate predicate) const + { + return set_predicate(predicate, emit(BRW_OPCODE_IF)); + } + + /** + * CSEL: dst = src2 0.0f ? src0 : src1 + */ + instruction * + CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1, + const src_reg &src2, brw_conditional_mod condition) const + { + /* CSEL only operates on floats, so we can't do integer =/> + * comparisons. Zero/non-zero (== and !=) comparisons almost work. + * 0x80000000 fails because it is -0.0, and -0.0 == 0.0. + */ + assert(src2.type == BRW_REGISTER_TYPE_F); + + return set_condmod(condition, + emit(BRW_OPCODE_CSEL, + retype(dst, BRW_REGISTER_TYPE_F), + retype(src0, BRW_REGISTER_TYPE_F), + retype(src1, BRW_REGISTER_TYPE_F), + src2)); + } + + /** + * Emit a linear interpolation instruction. + */ + instruction * + LRP(const dst_reg &dst, const src_reg &x, const src_reg &y, + const src_reg &a) const + { + if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) { + /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so + * we need to reorder the operands. 
+ */ + return emit(BRW_OPCODE_LRP, dst, a, y, x); + + } else { + /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */ + const dst_reg y_times_a = vgrf(dst.type); + const dst_reg one_minus_a = vgrf(dst.type); + const dst_reg x_times_one_minus_a = vgrf(dst.type); + + MUL(y_times_a, y, a); + ADD(one_minus_a, negate(a), brw_imm_f(1.0f)); + MUL(x_times_one_minus_a, x, src_reg(one_minus_a)); + return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)); + } + } + + /** + * Collect a number of registers in a contiguous range of registers. + */ + instruction * + LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src, + unsigned sources, unsigned header_size) const + { + instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources); + inst->header_size = header_size; + inst->size_written = header_size * REG_SIZE; + for (unsigned i = header_size; i < sources; i++) { + inst->size_written += dispatch_width() * type_sz(src[i].type) * + dst.stride; + } + + return inst; + } + + instruction * + UNDEF(const dst_reg &dst) const + { + assert(dst.file == VGRF); + assert(dst.offset % REG_SIZE == 0); + instruction *inst = emit(SHADER_OPCODE_UNDEF, + retype(dst, BRW_REGISTER_TYPE_UD)); + inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset; + + return inst; + } + + instruction * + DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2, + unsigned sdepth, unsigned rcount) const + { + assert(_dispatch_width == 8); + assert(sdepth == 8); + assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8); + + instruction *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2); + inst->sdepth = sdepth; + inst->rcount = rcount; + + if (dst.type == BRW_REGISTER_TYPE_HF) { + inst->size_written = rcount * REG_SIZE / 2; + } else { + inst->size_written = rcount * REG_SIZE; + } + + return inst; + } + + fs_visitor *shader; + + fs_inst *BREAK() { return emit(BRW_OPCODE_BREAK); } + fs_inst *DO() { return emit(BRW_OPCODE_DO); } + 
      fs_inst *ENDIF() { return emit(BRW_OPCODE_ENDIF); }
      fs_inst *NOP() { return emit(BRW_OPCODE_NOP); }
      fs_inst *WHILE() { return emit(BRW_OPCODE_WHILE); }
      fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         /* Resolve the negation through an explicit MOV into a fresh VGRF
          * so the consuming instruction sees a plain UD source.
          */
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         /* Unsupported region mode: materialize the value in a VGRF first. */
         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
+ * + * Gfx7 relaxes most of the above restrictions, but still can't use IMM + * operands to math + */ + if ((shader->devinfo->ver == 6 && + (src.file == IMM || src.file == UNIFORM || + src.abs || src.negate)) || + (shader->devinfo->ver == 7 && src.file == IMM)) { + const dst_reg tmp = vgrf(src.type); + MOV(tmp, src); + return tmp; + } else { + return src; + } + } + + bblock_t *block; + exec_node *cursor; + + unsigned _dispatch_width; + unsigned _group; + bool force_writemask_all; + + /** Debug annotation info. */ + struct { + const char *str; + const void *ir; + } annotation; + }; +} + +static inline fs_reg +offset(const fs_reg ®, const brw::fs_builder &bld, unsigned delta) +{ + return offset(reg, bld.dispatch_width(), delta); +} + +#endif diff --git a/src/intel/compiler/elk/brw_fs_cmod_propagation.cpp b/src/intel/compiler/elk/brw_fs_cmod_propagation.cpp new file mode 100644 index 00000000000..0fadb402172 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_cmod_propagation.cpp @@ -0,0 +1,568 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+/** @file brw_fs_cmod_propagation.cpp
+ *
+ * Implements a pass that propagates the conditional modifier from a CMP x 0.0
+ * instruction into the instruction that generated x.  For instance, in this
+ * sequence
+ *
+ *    add(8)          g70<1>F    g69<8,8,1>F    4096F
+ *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
+ *
+ * we can do the comparison as part of the ADD instruction directly:
+ *
+ *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
+ *
+ * If there had been a use of the flag register and another CMP using g70
+ *
+ *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
+ *    (+f0) sel(8)    g71        g72<8,8,1>F    g73<8,8,1>F
+ *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
+ *
+ * we can recognize that the CMP is generating the flag value that already
+ * exists and therefore remove the instruction.
+ */
+
+using namespace brw;
+
+/* Try to fold \p inst (a CMP against a non-zero source) into the ADD that
+ * computed the compared value, scanning backwards through the block.  On
+ * success the CMP is removed and true is returned.
+ */
+static bool
+cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
+                          fs_inst *inst)
+{
+   /* Set once any intervening instruction reads the flag bits inst writes;
+    * after that, changing an earlier instruction's cmod would be observable.
+    */
+   bool read_flag = false;
+   const unsigned flags_written = inst->flags_written(devinfo);
+
+   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+      if (scan_inst->opcode == BRW_OPCODE_ADD &&
+          !scan_inst->is_partial_write() &&
+          scan_inst->exec_size == inst->exec_size) {
+         bool negate;
+
+         /* A CMP is basically a subtraction.  The result of the
+          * subtraction must be the same as the result of the addition.
+          * This means that one of the operands must be negated.  So (a +
+          * b) vs (a == -b) or (a + -b) vs (a == b).
+          */
+         if ((inst->src[0].equals(scan_inst->src[0]) &&
+              inst->src[1].negative_equals(scan_inst->src[1])) ||
+             (inst->src[0].equals(scan_inst->src[1]) &&
+              inst->src[1].negative_equals(scan_inst->src[0]))) {
+            negate = false;
+         } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
+                     inst->src[1].equals(scan_inst->src[1])) ||
+                    (inst->src[0].negative_equals(scan_inst->src[1]) &&
+                     inst->src[1].equals(scan_inst->src[0]))) {
+            negate = true;
+         } else {
+            goto not_match;
+         }
+
+         /* If the scan instruction writes a different flag register than the
+          * instruction we're trying to propagate from, bail.
+          *
+          * FINISHME: The second part of the condition may be too strong.
+          * Perhaps (scan_inst->flags_written() & flags_written) !=
+          * flags_written?
+          */
+         if (scan_inst->flags_written(devinfo) != 0 &&
+             scan_inst->flags_written(devinfo) != flags_written)
+            goto not_match;
+
+         /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
+          *
+          *    * Note that the [post condition signal] bits generated at
+          *      the output of a compute are before the .sat.
+          *
+          * Paragraph about post_zero does not mention saturation, but
+          * testing it on actual GPUs shows that conditional modifiers
+          * are applied after saturation.
+          *
+          *    * post_zero bit: This bit reflects whether the final
+          *      result is zero after all the clamping, normalizing,
+          *      or format conversion logic.
+          *
+          * For signed types we don't care about saturation: it won't
+          * change the result of conditional modifier.
+          *
+          * For floating and unsigned types there two special cases,
+          * when we can remove inst even if scan_inst is saturated: G
+          * and LE.  Since conditional modifiers are just comparisons
+          * against zero, saturating positive values to the upper
+          * limit never changes the result of comparison.
+          *
+          * For negative values:
+          * (sat(x) > 0) == (x > 0) --- false
+          * (sat(x) <= 0) == (x <= 0) --- true
+          */
+         const enum brw_conditional_mod cond =
+            negate ? brw_swap_cmod(inst->conditional_mod)
+                   : inst->conditional_mod;
+
+         if (scan_inst->saturate &&
+             (brw_reg_type_is_floating_point(scan_inst->dst.type) ||
+              brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) &&
+             (cond != BRW_CONDITIONAL_G &&
+              cond != BRW_CONDITIONAL_LE))
+            goto not_match;
+
+         /* Otherwise, try propagating the conditional. */
+         if (scan_inst->can_do_cmod() &&
+             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+              scan_inst->conditional_mod == cond)) {
+            scan_inst->conditional_mod = cond;
+            scan_inst->flag_subreg = inst->flag_subreg;
+            inst->remove(block, true);
+            return true;
+         }
+         break;
+      }
+
+   not_match:
+      /* Stop at any instruction that clobbers the flag bits we depend on. */
+      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
+         break;
+
+      read_flag = read_flag ||
+                  (scan_inst->flags_read(devinfo) & flags_written) != 0;
+   }
+
+   return false;
+}
+
+/**
+ * Propagate conditional modifiers from NOT instructions
+ *
+ * Attempt to convert sequences like
+ *
+ *    or(8)           g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
+ *    ...
+ *    not.nz.f0(8)    null            g78<8,8,1>UD
+ *
+ * into
+ *
+ *    or.z.f0(8)      g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
+ */
+static bool
+cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
+                   fs_inst *inst)
+{
+   /* The NOT inverts its source, so the cmod propagated into the producer is
+    * the negation of the NOT's own cmod.
+    */
+   const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
+   bool read_flag = false;
+   const unsigned flags_written = inst->flags_written(devinfo);
+
+   /* Only Z/NZ survive logical inversion of the result bits; bail for any
+    * ordering cmod.
+    */
+   if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
+      return false;
+
+   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+      if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                          inst->src[0], inst->size_read(0))) {
+         if (scan_inst->opcode != BRW_OPCODE_OR &&
+             scan_inst->opcode != BRW_OPCODE_AND)
+            break;
+
+         if (scan_inst->is_partial_write() ||
+             scan_inst->dst.offset != inst->src[0].offset ||
+             scan_inst->exec_size != inst->exec_size)
+            break;
+
+         /* If the scan instruction writes a different flag register than the
+          * instruction we're trying to propagate from, bail.
+          *
+          * FINISHME: The second part of the condition may be too strong.
+          * Perhaps (scan_inst->flags_written() & flags_written) !=
+          * flags_written?
+          */
+         if (scan_inst->flags_written(devinfo) != 0 &&
+             scan_inst->flags_written(devinfo) != flags_written)
+            break;
+
+         if (scan_inst->can_do_cmod() &&
+             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+              scan_inst->conditional_mod == cond)) {
+            scan_inst->conditional_mod = cond;
+            scan_inst->flag_subreg = inst->flag_subreg;
+            inst->remove(block, true);
+            return true;
+         }
+         break;
+      }
+
+      if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
+         break;
+
+      read_flag = read_flag ||
+                  (scan_inst->flags_read(devinfo) & flags_written) != 0;
+   }
+
+   return false;
+}
+
+/* Walk \p block backwards, trying to fold each flag-writing, null-destination
+ * AND/CMP/MOV/NOT into the instruction that produced its source.  Returns
+ * true iff at least one instruction was removed.
+ */
+static bool
+opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
+{
+   bool progress = false;
+   UNUSED int ip = block->end_ip + 1;
+
+   foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+      ip--;
+
+      if ((inst->opcode != BRW_OPCODE_AND &&
+           inst->opcode != BRW_OPCODE_CMP &&
+           inst->opcode != BRW_OPCODE_MOV &&
+           inst->opcode != BRW_OPCODE_NOT) ||
+          inst->predicate != BRW_PREDICATE_NONE ||
+          !inst->dst.is_null() ||
+          (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
+           inst->src[0].file != UNIFORM))
+         continue;
+
+      /* An ABS source modifier can only be handled when processing a compare
+       * with a value other than zero.
+       */
+      if (inst->src[0].abs &&
+          (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
+         continue;
+
+      /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
+       * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
+       * Propagating those would require inverting the condition on the CMP.
+       * This changes both the flag value and the register destination of the
+       * CMP.  That result may be used elsewhere, so we can't change its value
+       * on a whim.
+       */
+      if (inst->opcode == BRW_OPCODE_AND &&
+          !(inst->src[1].is_one() &&
+            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+            !inst->src[0].negate))
+         continue;
+
+      /* A CMP with a second source of zero can match with anything.
A CMP
+       * with a second source that is not zero can only match with an ADD
+       * instruction.
+       *
+       * Only apply this optimization to float-point sources.  It can fail for
+       * integers.  For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
+       * int(0x80000000) - 4 overflows and results in 0x7ffffffc.  that's not
+       * less than zero, so the flags get set differently than for (a < b).
+       */
+      if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
+         if (brw_reg_type_is_floating_point(inst->src[0].type) &&
+             cmod_propagate_cmp_to_add(devinfo, block, inst))
+            progress = true;
+
+         continue;
+      }
+
+      if (inst->opcode == BRW_OPCODE_NOT) {
+         progress = cmod_propagate_not(devinfo, block, inst) || progress;
+         continue;
+      }
+
+      /* General case: inst is CMP-with-zero, MOV, or AND.NZ-with-one.  Scan
+       * backwards for the instruction that wrote inst->src[0].
+       */
+      bool read_flag = false;
+      const unsigned flags_written = inst->flags_written(devinfo);
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                             inst->src[0], inst->size_read(0))) {
+            /* If the scan instruction writes a different flag register than
+             * the instruction we're trying to propagate from, bail.
+             *
+             * FINISHME: The second part of the condition may be too strong.
+             * Perhaps (scan_inst->flags_written() & flags_written) !=
+             * flags_written?
+             */
+            if (scan_inst->flags_written(devinfo) != 0 &&
+                scan_inst->flags_written(devinfo) != flags_written)
+               break;
+
+            if (scan_inst->is_partial_write() ||
+                scan_inst->dst.offset != inst->src[0].offset ||
+                scan_inst->exec_size != inst->exec_size)
+               break;
+
+            /* If the write mask is different we can't propagate. */
+            if (scan_inst->force_writemask_all != inst->force_writemask_all)
+               break;
+
+            /* CMP's result is the same regardless of dest type. */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                scan_inst->opcode == BRW_OPCODE_CMP &&
+                brw_reg_type_is_integer(inst->dst.type)) {
+               inst->remove(block, true);
+               progress = true;
+               break;
+            }
+
+            /* If the AND wasn't handled by the previous case, it isn't safe
+             * to remove it.
+             */
+            if (inst->opcode == BRW_OPCODE_AND)
+               break;
+
+            if (inst->opcode == BRW_OPCODE_MOV) {
+               if (brw_reg_type_is_floating_point(scan_inst->dst.type)) {
+                  /* If the destination type of scan_inst is floating-point,
+                   * then:
+                   *
+                   * - The source of the MOV instruction must be the same
+                   *   type.
+                   *
+                   * - The destination of the MOV instruction must be float
+                   *   point with a size at least as large as the destination
+                   *   of inst.  Size-reducing f2f conversions could cause
+                   *   non-zero values to become zero, etc.
+                   */
+                  if (scan_inst->dst.type != inst->src[0].type)
+                     break;
+
+                  if (!brw_reg_type_is_floating_point(inst->dst.type))
+                     break;
+
+                  if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type))
+                     break;
+               } else {
+                  /* If the destination type of scan_inst is integer, then:
+                   *
+                   * - The source of the MOV instruction must be integer with
+                   *   the same size.
+                   *
+                   * - If the conditional modifier is Z or NZ, then the
+                   *   destination type of inst must either be floating point
+                   *   (of any size) or integer with a size at least as large
+                   *   as the destination of inst.
+                   *
+                   * - If the conditional modifier is neither Z nor NZ, then the
+                   *   destination type of inst must either be floating point
+                   *   (of any size) or integer with a size at least as large
+                   *   as the destination of inst and the same signedness.
+                   */
+                  if (!brw_reg_type_is_integer(inst->src[0].type) ||
+                      type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type))
+                     break;
+
+                  if (brw_reg_type_is_integer(inst->dst.type)) {
+                     if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type))
+                        break;
+
+                     if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
+                         inst->conditional_mod != BRW_CONDITIONAL_NZ &&
+                         brw_reg_type_is_unsigned_integer(inst->dst.type) !=
+                         brw_reg_type_is_unsigned_integer(scan_inst->dst.type))
+                        break;
+                  }
+               }
+            } else {
+               /* Not safe to use inequality operators if the types are
+                * different.
+                */
+               if (scan_inst->dst.type != inst->src[0].type &&
+                   inst->conditional_mod != BRW_CONDITIONAL_Z &&
+                   inst->conditional_mod != BRW_CONDITIONAL_NZ)
+                  break;
+
+               /* Comparisons operate differently for ints and floats */
+               if (scan_inst->dst.type != inst->dst.type) {
+                  /* Comparison result may be altered if the bit-size changes
+                   * since that affects range, denorms, etc
+                   */
+                  if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
+                     break;
+
+                  if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
+                      brw_reg_type_is_floating_point(inst->dst.type))
+                     break;
+               }
+            }
+
+            /* Knowing following:
+             * - CMP writes to flag register the result of
+             *   applying cmod to the `src0 - src1`.
+             *   After that it stores the same value to dst.
+             *   Other instructions first store their result to
+             *   dst, and then store cmod(dst) to the flag
+             *   register.
+             * - inst is either CMP or MOV
+             * - inst->dst is null
+             * - inst->src[0] overlaps with scan_inst->dst
+             * - inst->src[1] is zero
+             * - scan_inst wrote to a flag register
+             *
+             * There can be three possible paths:
+             *
+             * - scan_inst is CMP:
+             *
+             *   Considering that src0 is either 0x0 (false),
+             *   or 0xffffffff (true), and src1 is 0x0:
+             *
+             *   - If inst's cmod is NZ, we can always remove
+             *     scan_inst: NZ is invariant for false and true.  This
+             *     holds even if src0 is NaN: .nz is the only cmod,
+             *     that returns true for NaN.
+             *
+             *   - .g is invariant if src0 has a UD type
+             *
+             *   - .l is invariant if src0 has a D type
+             *
+             * - scan_inst and inst have the same cmod:
+             *
+             *   If scan_inst is anything than CMP, it already
+             *   wrote the appropriate value to the flag register.
+             *
+             * - else:
+             *
+             *   We can change cmod of scan_inst to that of inst,
+             *   and remove inst.  It is valid as long as we make
+             *   sure that no instruction uses the flag register
+             *   between scan_inst and inst.
+             */
+            if (!inst->src[0].negate &&
+                scan_inst->flags_written(devinfo)) {
+               if (scan_inst->opcode == BRW_OPCODE_CMP) {
+                  if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
+                      (inst->conditional_mod == BRW_CONDITIONAL_G &&
+                       inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
+                      (inst->conditional_mod == BRW_CONDITIONAL_L &&
+                       inst->src[0].type == BRW_REGISTER_TYPE_D)) {
+                     inst->remove(block, true);
+                     progress = true;
+                     break;
+                  }
+               } else if (scan_inst->conditional_mod == inst->conditional_mod) {
+                  /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
+                   * flags value is not based on the result stored in the
+                   * destination.  On all other platforms sel.cond will not
+                   * write the flags, so execution will not get to this point.
+                   */
+                  if (scan_inst->opcode == BRW_OPCODE_SEL) {
+                     assert(devinfo->ver <= 5);
+                  } else {
+                     inst->remove(block, true);
+                     progress = true;
+                  }
+
+                  break;
+               } else if (!read_flag && scan_inst->can_do_cmod()) {
+                  scan_inst->conditional_mod = inst->conditional_mod;
+                  scan_inst->flag_subreg = inst->flag_subreg;
+                  inst->remove(block, true);
+                  progress = true;
+                  break;
+               }
+            }
+
+            /* The conditional mod of the CMP/CMPN instructions behaves
+             * specially because the flag output is not calculated from the
+             * result of the instruction, but the other way around, which
+             * means that even if the condmod to propagate and the condmod
+             * from the CMP instruction are the same they will in general give
+             * different results because they are evaluated based on different
+             * inputs.
+             */
+            if (scan_inst->opcode == BRW_OPCODE_CMP ||
+                scan_inst->opcode == BRW_OPCODE_CMPN)
+               break;
+
+            /* From the Sky Lake PRM, Vol 2a, "Multiply":
+             *
+             *    "When multiplying integer data types, if one of the sources
+             *    is a DW, the resulting full precision data is stored in
+             *    the accumulator.  However, if the destination data type is
+             *    either W or DW, the low bits of the result are written to
+             *    the destination register and the remaining high bits are
+             *    discarded.  This results in undefined Overflow and Sign
+             *    flags.  Therefore, conditional modifiers and saturation
+             *    (.sat) cannot be used in this case."
+             *
+             * We just disallow cmod propagation on all integer multiplies.
+             */
+            if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
+                scan_inst->opcode == BRW_OPCODE_MUL)
+               break;
+
+            enum brw_conditional_mod cond =
+               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
+                                   : inst->conditional_mod;
+
+            /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
+             *
+             *    * Note that the [post condition signal] bits generated at
+             *      the output of a compute are before the .sat.
+             *
+             * Paragraph about post_zero does not mention saturation, but
+             * testing it on actual GPUs shows that conditional modifiers are
+             * applied after saturation.
+             *
+             *    * post_zero bit: This bit reflects whether the final
+             *      result is zero after all the clamping, normalizing,
+             *      or format conversion logic.
+             *
+             * For this reason, no additional restrictions are necessary on
+             * instructions with saturate.
+             */
+
+            /* Otherwise, try propagating the conditional. */
+            if (scan_inst->can_do_cmod() &&
+                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+                 scan_inst->conditional_mod == cond)) {
+               scan_inst->conditional_mod = cond;
+               scan_inst->flag_subreg = inst->flag_subreg;
+               inst->remove(block, true);
+               progress = true;
+            }
+            break;
+         }
+
+         if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
+            break;
+
+         read_flag = read_flag ||
+                     (scan_inst->flags_read(devinfo) & flags_written) != 0;
+      }
+   }
+
+   /* There is progress if and only if instructions were removed. */
+   assert(progress == (block->end_ip_delta != 0));
+
+   return progress;
+}
+
+/* Run the local pass over every block; fix up block IPs and invalidate the
+ * instruction-dependent analyses only if something was removed.
+ */
+bool
+fs_visitor::opt_cmod_propagation()
+{
+   bool progress = false;
+
+   foreach_block_reverse(block, cfg) {
+      progress = opt_cmod_propagation_local(devinfo, block) || progress;
+   }
+
+   if (progress) {
+      cfg->adjust_block_ips();
+
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   }
+
+   return progress;
+}
diff --git a/src/intel/compiler/elk/brw_fs_combine_constants.cpp b/src/intel/compiler/elk/brw_fs_combine_constants.cpp
new file mode 100644
index 00000000000..ed5176153da
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_combine_constants.cpp
@@ -0,0 +1,1858 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_combine_constants.cpp
+ *
+ * This file contains the opt_combine_constants() pass that runs after the
+ * regular optimization loop.  It passes over the instruction list and
+ * selectively promotes immediate values to registers by emitting a mov(1)
+ * instruction.
+ *
+ * This is useful on Gen 7 particularly, because a few instructions can be
+ * coissued (i.e., issued in the same cycle as another thread on the same EU
+ * issues an instruction) under some circumstances, one of which is that they
+ * cannot use immediate values.
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_builder.h"
+#include "brw_cfg.h"
+#include "util/half_float.h"
+
+using namespace brw;
+
+static const bool debug = false;
+
+/* How an instruction is permitted to interpret a candidate constant. */
+enum PACKED interpreted_type {
+   float_only = 0,
+   integer_only,
+   either_type
+};
+
+struct value {
+   /** Raw bit pattern of the value. */
+   nir_const_value value;
+
+   /** Instruction that uses this instance of the value. */
+   unsigned instr_index;
+
+   /** Size, in bits, of the value. */
+   uint8_t bit_size;
+
+   /**
+    * Which source of instr is this value?
+    *
+    * \note This field is not actually used by \c brw_combine_constants, but
+    * it is generally very useful to callers.
+    */
+   uint8_t src;
+
+   /**
+    * In what ways can instr interpret this value?
+    *
+    * Choices are floating-point only, integer only, or either type.
+    */
+   enum interpreted_type type;
+
+   /**
+    * Only try to make a single source non-constant.
+    *
+    * On some architectures, some instructions require that all sources be
+    * non-constant.  For example, the multiply-accumulate instruction on Intel
+    * GPUs up to Gen11 require that all sources be non-constant.  Other
+    * instructions, like the selection instruction, allow one constant source.
+    *
+    * If a single constant source is allowed, set this flag to true.
+    *
+    * If an instruction allows a single constant and it has only a single
+    * constant to begin, it should be included.  Various places in
+    * \c combine_constants will assume that there are multiple constants if
+    * \c ::allow_one_constant is set.  This may even be enforced by in-code
+    * assertions.
+    */
+   bool allow_one_constant;
+
+   /**
+    * Restrict values that can reach this value to not include negations.
+    *
+    * This is useful for instructions that cannot have source modifiers.  For
+    * example, on Intel GPUs the integer source of a shift instruction (e.g.,
+    * SHL) can have a source modifier, but the integer source of the bitfield
+    * insertion instruction (i.e., BFI2) cannot.  A pair of these instructions
+    * might have sources that are negations of each other.  Using this flag
+    * will ensure that the BFI2 does not have a negated source, but the SHL
+    * might.
+    */
+   bool no_negations;
+
+   /**
+    * \name UtilCombineConstantsPrivate
+    * Private data used only by brw_combine_constants
+    *
+    * Any data stored in these fields will be overwritten by the call to
+    * \c brw_combine_constants.  No assumptions should be made about the
+    * state of these fields after that function returns.
+    */
+   /**@{*/
+   /** Mask of negations that can be generated from this value. */
+   uint8_t reachable_mask;
+
+   /** Mask of negations that can generate this value. */
+   uint8_t reaching_mask;
+
+   /**
+    * Value with the next source from the same instruction.
+    *
+    * This pointer may be \c NULL.  If it is not \c NULL, it will form a
+    * singly-linked circular list of values.  The list is unordered.  That is,
+    * as the list is iterated, the \c ::src values will be in arbitrary order.
+    *
+    * \todo Is it even possible for there to be more than two elements in this
+    * list?  This pass does not operate on vecN instructions or intrinsics, so
+    * the theoretical limit should be three.  However, instructions with all
+    * constant sources should have been folded away.
+    */
+   struct value *next_src;
+   /**@}*/
+};
+
+struct combine_constants_value {
+   /** Raw bit pattern of the constant loaded. */
+   nir_const_value value;
+
+   /**
+    * Index of the first user.
+    *
+    * This is the offset into \c combine_constants_result::user_map of the
+    * first user of this value.
+    */
+   unsigned first_user;
+
+   /** Number of users of this value. */
+   unsigned num_users;
+
+   /** Size, in bits, of the value. */
+   uint8_t bit_size;
+};
+
+struct combine_constants_user {
+   /** Index into the array of values passed to brw_combine_constants. */
+   unsigned index;
+
+   /**
+    * Manner in which the value should be interpreted in the instruction.
+    *
+    * This is only useful when ::negate is set.  Unless the corresponding
+    * value::type is \c either_type, this field must have the same value as
+    * value::type.
+    */
+   enum interpreted_type type;
+
+   /** Should this value be negated to generate the original value? */
+   bool negate;
+};
+
+/* Owns the output of the combining pass: the deduplicated values to emit and
+ * the mapping from each emitted value back to its users.  \p success reports
+ * allocation failure since a constructor cannot return an error.
+ */
+class combine_constants_result {
+public:
+   combine_constants_result(unsigned num_candidates, bool &success)
+      : num_values_to_emit(0), user_map(NULL)
+   {
+      user_map = (struct combine_constants_user *) calloc(num_candidates,
+                                                          sizeof(user_map[0]));
+
+      /* In the worst case, the number of output values will be equal to the
+       * number of input values.  Allocate a buffer that is known to be large
+       * enough now, and it can be reduced later.
+       */
+      values_to_emit =
+         (struct combine_constants_value *) calloc(num_candidates,
+                                                   sizeof(values_to_emit[0]));
+
+      success = (user_map != NULL && values_to_emit != NULL);
+   }
+
+   ~combine_constants_result()
+   {
+      free(values_to_emit);
+      free(user_map);
+   }
+
+   void append_value(const nir_const_value &value, unsigned bit_size)
+   {
+      values_to_emit[num_values_to_emit].value = value;
+      values_to_emit[num_values_to_emit].first_user = 0;
+      values_to_emit[num_values_to_emit].num_users = 0;
+      values_to_emit[num_values_to_emit].bit_size = bit_size;
+      num_values_to_emit++;
+   }
+
+   unsigned num_values_to_emit;
+   struct combine_constants_value *values_to_emit;
+
+   struct combine_constants_user *user_map;
+};
+
+/* Slot indices into the reachable_values array, and the matching bit masks
+ * used in reachable_mask / reaching_mask.
+ */
+#define VALUE_INDEX       0
+#define FLOAT_NEG_INDEX   1
+#define INT_NEG_INDEX     2
+#define MAX_NUM_REACHABLE 3
+
+#define VALUE_EXISTS      (1 << VALUE_INDEX)
+#define FLOAT_NEG_EXISTS  (1 << FLOAT_NEG_INDEX)
+#define INT_NEG_EXISTS    (1 << INT_NEG_INDEX)
+
+/* Does \p v have a representable negation under the given interpretation?
+ * Zero and the most-negative integer are excluded (the latter has no
+ * two's-complement negation), as are NaNs under float interpretation.
+ */
+static bool
+negation_exists(nir_const_value v, unsigned bit_size,
+                enum interpreted_type base_type)
+{
+   /* either_type does not make sense in this context. */
+   assert(base_type == float_only || base_type == integer_only);
+
+   switch (bit_size) {
+   case 8:
+      if (base_type == float_only)
+         return false;
+      else
+         return v.i8 != 0 && v.i8 != INT8_MIN;
+
+   case 16:
+      if (base_type == float_only)
+         return !util_is_half_nan(v.i16);
+      else
+         return v.i16 != 0 && v.i16 != INT16_MIN;
+
+   case 32:
+      if (base_type == float_only)
+         return !isnan(v.f32);
+      else
+         return v.i32 != 0 && v.i32 != INT32_MIN;
+
+   case 64:
+      if (base_type == float_only)
+         return !isnan(v.f64);
+      else
+         return v.i64 != 0 && v.i64 != INT64_MIN;
+
+   default:
+      unreachable("unsupported bit-size should have already been filtered.");
+   }
+}
+
+/* Negate \p v: sign-bit flip for float interpretation, two's-complement
+ * negation for integer interpretation.
+ */
+static nir_const_value
+negate(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
+{
+   /* either_type does not make sense in this context.
*/
+   assert(base_type == float_only || base_type == integer_only);
+
+   nir_const_value ret = { 0, };
+
+   switch (bit_size) {
+   case 8:
+      assert(base_type == integer_only);
+      ret.i8 = -v.i8;
+      break;
+
+   case 16:
+      if (base_type == float_only)
+         ret.u16 = v.u16 ^ INT16_MIN;
+      else
+         ret.i16 = -v.i16;
+      break;
+
+   case 32:
+      if (base_type == float_only)
+         ret.u32 = v.u32 ^ INT32_MIN;
+      else
+         ret.i32 = -v.i32;
+      break;
+
+   case 64:
+      if (base_type == float_only)
+         ret.u64 = v.u64 ^ INT64_MIN;
+      else
+         ret.i64 = -v.i64;
+      break;
+
+   default:
+      unreachable("unsupported bit-size should have already been filtered.");
+   }
+
+   return ret;
+}
+
+/* Absolute value of \p v under the given interpretation: clear the sign bit
+ * for floats, integer abs otherwise.
+ */
+static nir_const_value
+absolute(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
+{
+   /* either_type does not make sense in this context. */
+   assert(base_type == float_only || base_type == integer_only);
+
+   nir_const_value ret = { 0, };
+
+   switch (bit_size) {
+   case 8:
+      assert(base_type == integer_only);
+      ret.i8 = abs(v.i8);
+      break;
+
+   case 16:
+      if (base_type == float_only)
+         ret.u16 = v.u16 & 0x7fff;
+      else
+         ret.i16 = abs(v.i16);
+      break;
+
+   case 32:
+      if (base_type == float_only)
+         ret.f32 = fabs(v.f32);
+      else
+         ret.i32 = abs(v.i32);
+      break;
+
+   case 64:
+      if (base_type == float_only)
+         ret.f64 = fabs(v.f64);
+      else {
+         /* labs vs. llabs depends on the platform's width of long. */
+         if (sizeof(v.i64) == sizeof(long int)) {
+            ret.i64 = labs((long int) v.i64);
+         } else {
+            assert(sizeof(v.i64) == sizeof(long long int));
+            ret.i64 = llabs((long long int) v.i64);
+         }
+      }
+      break;
+
+   default:
+      unreachable("unsupported bit-size should have already been filtered.");
+   }
+
+   return ret;
+}
+
+/* Compute which negated forms of \p v exist (reachable_mask) and which
+ * negated forms could produce \p v (reaching_mask) for the allowed
+ * interpretations.
+ */
+static void
+calculate_masks(nir_const_value v, enum interpreted_type type,
+                unsigned bit_size, uint8_t *reachable_mask,
+                uint8_t *reaching_mask)
+{
+   *reachable_mask = 0;
+   *reaching_mask = 0;
+
+   /* Calculate the extended reachable mask. */
+   if (type == float_only || type == either_type) {
+      if (negation_exists(v, bit_size, float_only))
+         *reachable_mask |= FLOAT_NEG_EXISTS;
+   }
+
+   if (type == integer_only || type == either_type) {
+      if (negation_exists(v, bit_size, integer_only))
+         *reachable_mask |= INT_NEG_EXISTS;
+   }
+
+   /* Calculate the extended reaching mask.  All of the "is this negation
+    * possible" was already determined for the reachable_mask, so reuse that
+    * data.
+    */
+   if (type == float_only || type == either_type) {
+      if (*reachable_mask & FLOAT_NEG_EXISTS)
+         *reaching_mask |= FLOAT_NEG_EXISTS;
+   }
+
+   if (type == integer_only || type == either_type) {
+      if (*reachable_mask & INT_NEG_EXISTS)
+         *reaching_mask |= INT_NEG_EXISTS;
+   }
+}
+
+/* Fill \p reachable_values with v itself plus whichever negations the mask
+ * says exist; unused slots are zeroed.
+ */
+static void
+calculate_reachable_values(nir_const_value v,
+                           unsigned bit_size,
+                           unsigned reachable_mask,
+                           nir_const_value *reachable_values)
+{
+   memset(reachable_values, 0, MAX_NUM_REACHABLE * sizeof(reachable_values[0]));
+
+   reachable_values[VALUE_INDEX] = v;
+
+   if (reachable_mask & INT_NEG_EXISTS) {
+      const nir_const_value neg = negate(v, bit_size, integer_only);
+
+      reachable_values[INT_NEG_INDEX] = neg;
+   }
+
+   if (reachable_mask & FLOAT_NEG_EXISTS) {
+      const nir_const_value neg = negate(v, bit_size, float_only);
+
+      reachable_values[FLOAT_NEG_INDEX] = neg;
+   }
+}
+
+/* Bit-exact equality at the given width. */
+static bool
+value_equal(nir_const_value a, nir_const_value b, unsigned bit_size)
+{
+   switch (bit_size) {
+   case 8:
+      return a.u8 == b.u8;
+   case 16:
+      return a.u16 == b.u16;
+   case 32:
+      return a.u32 == b.u32;
+   case 64:
+      return a.u64 == b.u64;
+   default:
+      unreachable("unsupported bit-size should have already been filtered.");
+   }
+}
+
+/** Can these values be the same with one level of negation? */
+static bool
+value_can_equal(const nir_const_value *from, uint8_t reachable_mask,
+                nir_const_value to, uint8_t reaching_mask,
+                unsigned bit_size)
+{
+   const uint8_t combined_mask = reachable_mask & reaching_mask;
+
+   return value_equal(from[VALUE_INDEX], to, bit_size) ||
+          ((combined_mask & INT_NEG_EXISTS) &&
+           value_equal(from[INT_NEG_INDEX], to, bit_size)) ||
+          ((combined_mask & FLOAT_NEG_EXISTS) &&
+           value_equal(from[FLOAT_NEG_INDEX], to, bit_size));
+}
+
+/* Compute per-candidate negation masks and link candidates that belong to the
+ * same instruction into circular next_src lists.
+ *
+ * NOTE(review): `num_candidates - 1` underflows (unsigned) when
+ * num_candidates == 0 — presumably callers guarantee at least one candidate;
+ * TODO confirm.
+ */
+static void
+preprocess_candidates(struct value *candidates, unsigned num_candidates)
+{
+   /* Calculate the reaching_mask and reachable_mask for each candidate. */
+   for (unsigned i = 0; i < num_candidates; i++) {
+      calculate_masks(candidates[i].value,
+                      candidates[i].type,
+                      candidates[i].bit_size,
+                      &candidates[i].reachable_mask,
+                      &candidates[i].reaching_mask);
+
+      /* If negations are not allowed, then only the original value is
+       * reaching.
+       */
+      if (candidates[i].no_negations)
+         candidates[i].reaching_mask = 0;
+   }
+
+   for (unsigned i = 0; i < num_candidates; i++)
+      candidates[i].next_src = NULL;
+
+   for (unsigned i = 0; i < num_candidates - 1; i++) {
+      if (candidates[i].next_src != NULL)
+         continue;
+
+      struct value *prev = &candidates[i];
+
+      for (unsigned j = i + 1; j < num_candidates; j++) {
+         if (candidates[i].instr_index == candidates[j].instr_index) {
+            prev->next_src = &candidates[j];
+            prev = prev->next_src;
+         }
+      }
+
+      /* Close the cycle. */
+      if (prev != &candidates[i])
+         prev->next_src = &candidates[i];
+   }
+}
+
+/* Is some value already in \p values that candidate \p c (or one of its
+ * permitted negations) could reuse?
+ */
+static bool
+reaching_value_exists(const struct value *c,
+                      const struct combine_constants_value *values,
+                      unsigned num_values)
+{
+   nir_const_value reachable_values[MAX_NUM_REACHABLE];
+
+   calculate_reachable_values(c->value, c->bit_size, c->reaching_mask,
+                              reachable_values);
+
+   /* Check to see if the value is already in the result set.
*/ + for (unsigned j = 0; j < num_values; j++) { + if (c->bit_size == values[j].bit_size && + value_can_equal(reachable_values, c->reaching_mask, + values[j].value, c->reaching_mask, + c->bit_size)) { + return true; + } + } + + return false; +} + +static combine_constants_result * +combine_constants_greedy(struct value *candidates, unsigned num_candidates) +{ + bool success; + combine_constants_result *result = + new combine_constants_result(num_candidates, success); + if (result == NULL || !success) { + delete result; + return NULL; + } + + BITSET_WORD *remain = + (BITSET_WORD *) calloc(BITSET_WORDS(num_candidates), sizeof(remain[0])); + + if (remain == NULL) { + delete result; + return NULL; + } + + memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0])); + + /* Operate in three passes. The first pass handles all values that must be + * emitted and for which a negation cannot exist. + */ + unsigned i; + for (i = 0; i < num_candidates; i++) { + if (candidates[i].allow_one_constant || + (candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS))) { + continue; + } + + /* Check to see if the value is already in the result set. */ + bool found = false; + const unsigned num_values = result->num_values_to_emit; + for (unsigned j = 0; j < num_values; j++) { + if (candidates[i].bit_size == result->values_to_emit[j].bit_size && + value_equal(candidates[i].value, + result->values_to_emit[j].value, + candidates[i].bit_size)) { + found = true; + break; + } + } + + if (!found) + result->append_value(candidates[i].value, candidates[i].bit_size); + + BITSET_CLEAR(remain, i); + } + + /* The second pass handles all values that must be emitted and for which a + * negation can exist. 
+ */ + BITSET_FOREACH_SET(i, remain, num_candidates) { + if (candidates[i].allow_one_constant) + continue; + + assert(candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS)); + + if (!reaching_value_exists(&candidates[i], result->values_to_emit, + result->num_values_to_emit)) { + result->append_value(absolute(candidates[i].value, + candidates[i].bit_size, + candidates[i].type), + candidates[i].bit_size); + } + + BITSET_CLEAR(remain, i); + } + + /* The third pass handles all of the values that may not have to be + * emitted. These are the values where allow_one_constant is set. + */ + BITSET_FOREACH_SET(i, remain, num_candidates) { + assert(candidates[i].allow_one_constant); + + /* The BITSET_FOREACH_SET macro does not detect changes to the bitset + * that occur within the current word. Since code in this loop may + * clear bits from the set, re-test here. + */ + if (!BITSET_TEST(remain, i)) + continue; + + assert(candidates[i].next_src != NULL); + + const struct value *const other_candidate = candidates[i].next_src; + const unsigned j = other_candidate - candidates; + + if (!reaching_value_exists(&candidates[i], result->values_to_emit, + result->num_values_to_emit)) { + /* Before emitting a value, see if a match for the other source of + * the instruction exists. + */ + if (!reaching_value_exists(&candidates[j], result->values_to_emit, + result->num_values_to_emit)) { + result->append_value(candidates[i].value, candidates[i].bit_size); + } + } + + /* Mark both sources as handled. */ + BITSET_CLEAR(remain, i); + BITSET_CLEAR(remain, j); + } + + /* As noted above, there will never be more values in the output than in + * the input. If there are fewer values, reduce the size of the + * allocation. 
+ */ + if (result->num_values_to_emit < num_candidates) { + result->values_to_emit = (struct combine_constants_value *) + realloc(result->values_to_emit, sizeof(result->values_to_emit[0]) * + result->num_values_to_emit); + + /* Is it even possible for a reducing realloc to fail? */ + assert(result->values_to_emit != NULL); + } + + /* Create the mapping from "combined" constants to list of candidates + * passed in by the caller. + */ + memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0])); + + unsigned total_users = 0; + + const unsigned num_values = result->num_values_to_emit; + for (unsigned value_idx = 0; value_idx < num_values; value_idx++) { + result->values_to_emit[value_idx].first_user = total_users; + + uint8_t reachable_mask; + uint8_t unused_mask; + + calculate_masks(result->values_to_emit[value_idx].value, either_type, + result->values_to_emit[value_idx].bit_size, + &reachable_mask, &unused_mask); + + nir_const_value reachable_values[MAX_NUM_REACHABLE]; + + calculate_reachable_values(result->values_to_emit[value_idx].value, + result->values_to_emit[value_idx].bit_size, + reachable_mask, reachable_values); + + for (unsigned i = 0; i < num_candidates; i++) { + bool matched = false; + + if (!BITSET_TEST(remain, i)) + continue; + + if (candidates[i].bit_size != result->values_to_emit[value_idx].bit_size) + continue; + + if (value_equal(candidates[i].value, result->values_to_emit[value_idx].value, + result->values_to_emit[value_idx].bit_size)) { + result->user_map[total_users].index = i; + result->user_map[total_users].type = candidates[i].type; + result->user_map[total_users].negate = false; + total_users++; + + matched = true; + BITSET_CLEAR(remain, i); + } else { + const uint8_t combined_mask = reachable_mask & + candidates[i].reaching_mask; + + enum interpreted_type type = either_type; + + if ((combined_mask & INT_NEG_EXISTS) && + value_equal(candidates[i].value, + reachable_values[INT_NEG_INDEX], + candidates[i].bit_size)) { + type = 
integer_only; + } + + if (type == either_type && + (combined_mask & FLOAT_NEG_EXISTS) && + value_equal(candidates[i].value, + reachable_values[FLOAT_NEG_INDEX], + candidates[i].bit_size)) { + type = float_only; + } + + if (type != either_type) { + /* Finding a match on this path implies that the user must + * allow source negations. + */ + assert(!candidates[i].no_negations); + + result->user_map[total_users].index = i; + result->user_map[total_users].type = type; + result->user_map[total_users].negate = true; + total_users++; + + matched = true; + BITSET_CLEAR(remain, i); + } + } + + /* Mark the other source of instructions that can have a constant + * source. Selection is the prime example of this, and we want to + * avoid generating sequences like bcsel(a, fneg(b), ineg(c)). + * + * This also makes sure that the assertion (below) that *all* values + * were processed holds even when some values may be allowed to + * remain as constants. + * + * FINISHME: There may be value in only doing this when type == + * either_type. If both sources are loaded, a register allocator may + * be able to make a better choice about which value to "spill" + * (i.e., replace with an immediate) under heavy register pressure. + */ + if (matched && candidates[i].allow_one_constant) { + const struct value *const other_src = candidates[i].next_src; + const unsigned idx = other_src - candidates; + + assert(idx < num_candidates); + BITSET_CLEAR(remain, idx); + } + } + + assert(total_users > result->values_to_emit[value_idx].first_user); + result->values_to_emit[value_idx].num_users = + total_users - result->values_to_emit[value_idx].first_user; + } + + /* Verify that all of the values were emitted by the loop above. If any + * bits are still set in remain, then some value was not emitted. The use + * of memset to populate remain prevents the use of a more performant loop. 
+ */ +#ifndef NDEBUG + bool pass = true; + + BITSET_FOREACH_SET(i, remain, num_candidates) { + fprintf(stderr, "candidate %d was not processed: { " + ".b = %s, " + ".f32 = %f, .f64 = %g, " + ".i8 = %d, .u8 = 0x%02x, " + ".i16 = %d, .u16 = 0x%04x, " + ".i32 = %d, .u32 = 0x%08x, " + ".i64 = %" PRId64 ", .u64 = 0x%016" PRIx64 " }\n", + i, + candidates[i].value.b ? "true" : "false", + candidates[i].value.f32, candidates[i].value.f64, + candidates[i].value.i8, candidates[i].value.u8, + candidates[i].value.i16, candidates[i].value.u16, + candidates[i].value.i32, candidates[i].value.u32, + candidates[i].value.i64, candidates[i].value.u64); + pass = false; + } + + assert(pass && "All values should have been processed."); +#endif + + free(remain); + + return result; +} + +static combine_constants_result * +brw_combine_constants(struct value *candidates, unsigned num_candidates) +{ + preprocess_candidates(candidates, num_candidates); + + return combine_constants_greedy(candidates, num_candidates); +} + +/* Returns whether an instruction could co-issue if its immediate source were + * replaced with a GRF source. + */ +static bool +could_coissue(const struct intel_device_info *devinfo, const fs_inst *inst) +{ + assert(inst->opcode == BRW_OPCODE_MOV || + inst->opcode == BRW_OPCODE_CMP || + inst->opcode == BRW_OPCODE_ADD || + inst->opcode == BRW_OPCODE_MUL); + + if (devinfo->ver != 7) + return false; + + /* Only float instructions can coissue. We don't have a great + * understanding of whether or not something like float(int(a) + int(b)) + * would be considered float (based on the destination type) or integer + * (based on the source types), so we take the conservative choice of + * only promoting when both destination and source are float. 
+ */ + return inst->dst.type == BRW_REGISTER_TYPE_F && + inst->src[0].type == BRW_REGISTER_TYPE_F; +} + +/** + * Box for storing fs_inst and some other necessary data + * + * \sa box_instruction + */ +struct fs_inst_box { + fs_inst *inst; + unsigned ip; + bblock_t *block; + bool must_promote; +}; + +/** A box for putting fs_regs in a linked list. */ +struct reg_link { + DECLARE_RALLOC_CXX_OPERATORS(reg_link) + + reg_link(fs_inst *inst, unsigned src, bool negate, enum interpreted_type type) + : inst(inst), src(src), negate(negate), type(type) {} + + struct exec_node link; + fs_inst *inst; + uint8_t src; + bool negate; + enum interpreted_type type; +}; + +static struct exec_node * +link(void *mem_ctx, fs_inst *inst, unsigned src, bool negate, + enum interpreted_type type) +{ + reg_link *l = new(mem_ctx) reg_link(inst, src, negate, type); + return &l->link; +} + +/** + * Information about an immediate value. + */ +struct imm { + /** The common ancestor of all blocks using this immediate value. */ + bblock_t *block; + + /** + * The instruction generating the immediate value, if all uses are contained + * within a single basic block. Otherwise, NULL. + */ + fs_inst *inst; + + /** + * A list of fs_regs that refer to this immediate. If we promote it, we'll + * have to patch these up to refer to the new GRF. + */ + exec_list *uses; + + /** The immediate value */ + union { + char bytes[8]; + double df; + int64_t d64; + float f; + int32_t d; + int16_t w; + }; + uint8_t size; + + /** When promoting half-float we need to account for certain restrictions */ + bool is_half_float; + + /** + * The GRF register and subregister number where we've decided to store the + * constant value. + */ + uint8_t subreg_offset; + uint16_t nr; + + /** The number of coissuable instructions using this immediate. */ + uint16_t uses_by_coissue; + + /** + * Whether this constant is used by an instruction that can't handle an + * immediate source (and already has to be promoted to a GRF). 
+ */ + bool must_promote; + + /** Is the value used only in a single basic block? */ + bool used_in_single_block; + + uint16_t first_use_ip; + uint16_t last_use_ip; +}; + +/** The working set of information about immediates. */ +struct table { + struct value *values; + int size; + int num_values; + + struct imm *imm; + int len; + + struct fs_inst_box *boxes; + unsigned num_boxes; + unsigned size_boxes; +}; + +static struct value * +new_value(struct table *table, void *mem_ctx) +{ + if (table->num_values == table->size) { + table->size *= 2; + table->values = reralloc(mem_ctx, table->values, struct value, table->size); + } + return &table->values[table->num_values++]; +} + +/** + * Store an instruction with some other data in a table. + * + * \returns the index into the dynamic array of boxes for the instruction. + */ +static unsigned +box_instruction(struct table *table, void *mem_ctx, fs_inst *inst, + unsigned ip, bblock_t *block, bool must_promote) +{ + /* It is common for box_instruction to be called consecutively for each + * source of an instruction. As a result, the most common case for finding + * an instruction in the table is when that instruction was the last one + * added. Search the list back to front. + */ + for (unsigned i = table->num_boxes; i > 0; /* empty */) { + i--; + + if (table->boxes[i].inst == inst) + return i; + } + + if (table->num_boxes == table->size_boxes) { + table->size_boxes *= 2; + table->boxes = reralloc(mem_ctx, table->boxes, fs_inst_box, + table->size_boxes); + } + + assert(table->num_boxes < table->size_boxes); + + const unsigned idx = table->num_boxes++; + fs_inst_box *ib = &table->boxes[idx]; + + ib->inst = inst; + ib->block = block; + ib->ip = ip; + ib->must_promote = must_promote; + + return idx; +} + +/** + * Comparator used for sorting an array of imm structures. + * + * We sort by basic block number, then last use IP, then first use IP (least + * to greatest). 
This sorting causes immediates live in the same area to be + * allocated to the same register in the hopes that all values will be dead + * about the same time and the register can be reused. + */ +static int +compare(const void *_a, const void *_b) +{ + const struct imm *a = (const struct imm *)_a, + *b = (const struct imm *)_b; + + int block_diff = a->block->num - b->block->num; + if (block_diff) + return block_diff; + + int end_diff = a->last_use_ip - b->last_use_ip; + if (end_diff) + return end_diff; + + return a->first_use_ip - b->first_use_ip; +} + +static struct brw_reg +build_imm_reg_for_copy(struct imm *imm) +{ + switch (imm->size) { + case 8: + return brw_imm_d(imm->d64); + case 4: + return brw_imm_d(imm->d); + case 2: + return brw_imm_w(imm->w); + default: + unreachable("not implemented"); + } +} + +static inline uint32_t +get_alignment_for_imm(const struct imm *imm) +{ + if (imm->is_half_float) + return 4; /* At least MAD seems to require this */ + else + return imm->size; +} + +static bool +representable_as_hf(float f, uint16_t *hf) +{ + union fi u; + uint16_t h = _mesa_float_to_half(f); + u.f = _mesa_half_to_float(h); + + if (u.f == f) { + *hf = h; + return true; + } + + return false; +} + +static bool +representable_as_w(int d, int16_t *w) +{ + int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff; + if (!res) { + *w = d; + return true; + } + + return false; +} + +static bool +representable_as_uw(unsigned ud, uint16_t *uw) +{ + if (!(ud & 0xffff0000)) { + *uw = ud; + return true; + } + + return false; +} + +static bool +supports_src_as_imm(const struct intel_device_info *devinfo, const fs_inst *inst) +{ + if (devinfo->ver < 12) + return false; + + switch (inst->opcode) { + case BRW_OPCODE_ADD3: + /* ADD3 only exists on Gfx12.5+. */ + return true; + + case BRW_OPCODE_MAD: + /* Integer types can always mix sizes. Floating point types can mix + * sizes on Gfx12. On Gfx12.5, floating point sources must all be HF or + * all be F. 
+ */ + return devinfo->verx10 < 125 || inst->src[0].type != BRW_REGISTER_TYPE_F; + + default: + return false; + } +} + +static bool +can_promote_src_as_imm(const struct intel_device_info *devinfo, fs_inst *inst, + unsigned src_idx) +{ + bool can_promote = false; + + /* Experiment shows that we can only support src0 as immediate for MAD on + * Gfx12. ADD3 can use src0 or src2 in Gfx12.5, but constant propagation + * only propagates into src0. It's possible that src2 works for W or UW MAD + * on Gfx12.5. + */ + if (src_idx != 0) + return false; + + if (!supports_src_as_imm(devinfo, inst)) + return false; + + /* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP, + * since HF/F mixed mode has been removed from the hardware. + */ + switch (inst->src[src_idx].type) { + case BRW_REGISTER_TYPE_F: { + uint16_t hf; + if (representable_as_hf(inst->src[src_idx].f, &hf)) { + inst->src[src_idx] = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF); + can_promote = true; + } + break; + } + case BRW_REGISTER_TYPE_D: { + int16_t w; + if (representable_as_w(inst->src[src_idx].d, &w)) { + inst->src[src_idx] = brw_imm_w(w); + can_promote = true; + } + break; + } + case BRW_REGISTER_TYPE_UD: { + uint16_t uw; + if (representable_as_uw(inst->src[src_idx].ud, &uw)) { + inst->src[src_idx] = brw_imm_uw(uw); + can_promote = true; + } + break; + } + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_HF: + can_promote = true; + break; + default: + break; + } + + return can_promote; +} + +static void +add_candidate_immediate(struct table *table, fs_inst *inst, unsigned ip, + unsigned i, + bool must_promote, + bool allow_one_constant, + bblock_t *block, + const struct intel_device_info *devinfo, + void *const_ctx) +{ + struct value *v = new_value(table, const_ctx); + + unsigned box_idx = box_instruction(table, const_ctx, inst, ip, block, + must_promote); + + v->value.u64 = inst->src[i].d64; + v->bit_size = 8 * type_sz(inst->src[i].type); + v->instr_index = 
box_idx; + v->src = i; + v->allow_one_constant = allow_one_constant; + + /* Right-shift instructions are special. They can have source modifiers, + * but changing the type can change the semantic of the instruction. Only + * allow negations on a right shift if the source type is already signed. + */ + v->no_negations = !inst->can_do_source_mods(devinfo) || + ((inst->opcode == BRW_OPCODE_SHR || + inst->opcode == BRW_OPCODE_ASR) && + brw_reg_type_is_unsigned_integer(inst->src[i].type)); + + switch (inst->src[i].type) { + case BRW_REGISTER_TYPE_DF: + case BRW_REGISTER_TYPE_NF: + case BRW_REGISTER_TYPE_F: + case BRW_REGISTER_TYPE_HF: + v->type = float_only; + break; + + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_Q: + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_W: + v->type = integer_only; + break; + + case BRW_REGISTER_TYPE_VF: + case BRW_REGISTER_TYPE_UV: + case BRW_REGISTER_TYPE_V: + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_B: + default: + unreachable("not reached"); + } + + /* It is safe to change the type of the operands of a select instruction + * that has no conditional modifier, no source modifiers, and no saturate + * modifer. + */ + if (inst->opcode == BRW_OPCODE_SEL && + inst->conditional_mod == BRW_CONDITIONAL_NONE && + !inst->src[0].negate && !inst->src[0].abs && + !inst->src[1].negate && !inst->src[1].abs && + !inst->saturate) { + v->type = either_type; + } +} + +struct register_allocation { + /** VGRF for storing values. */ + unsigned nr; + + /** + * Mask of currently available slots in this register. + * + * Each register is 16, 16-bit slots. Allocations require 1, 2, or 4 slots + * for word, double-word, or quad-word values, respectively. 
+ */ + uint16_t avail; +}; + +static fs_reg +allocate_slots(struct register_allocation *regs, unsigned num_regs, + unsigned bytes, unsigned align_bytes, + brw::simple_allocator &alloc) +{ + assert(bytes == 2 || bytes == 4 || bytes == 8); + assert(align_bytes == 2 || align_bytes == 4 || align_bytes == 8); + + const unsigned words = bytes / 2; + const unsigned align_words = align_bytes / 2; + const uint16_t mask = (1U << words) - 1; + + for (unsigned i = 0; i < num_regs; i++) { + for (unsigned j = 0; j <= (16 - words); j += align_words) { + const uint16_t x = regs[i].avail >> j; + + if ((x & mask) == mask) { + if (regs[i].nr == UINT_MAX) + regs[i].nr = alloc.allocate(1); + + regs[i].avail &= ~(mask << j); + + fs_reg reg(VGRF, regs[i].nr); + reg.offset = j * 2; + + return reg; + } + } + } + + unreachable("No free slots found."); +} + +static void +deallocate_slots(struct register_allocation *regs, unsigned num_regs, + unsigned reg_nr, unsigned subreg_offset, unsigned bytes) +{ + assert(bytes == 2 || bytes == 4 || bytes == 8); + assert(subreg_offset % 2 == 0); + assert(subreg_offset + bytes <= 32); + + const unsigned words = bytes / 2; + const unsigned offset = subreg_offset / 2; + const uint16_t mask = ((1U << words) - 1) << offset; + + for (unsigned i = 0; i < num_regs; i++) { + if (regs[i].nr == reg_nr) { + regs[i].avail |= mask; + return; + } + } + + unreachable("No such register found."); +} + +static void +parcel_out_registers(struct imm *imm, unsigned len, const bblock_t *cur_block, + struct register_allocation *regs, unsigned num_regs, + brw::simple_allocator &alloc, unsigned ver) +{ + /* Each basic block has two distinct set of constants. There is the set of + * constants that only have uses in that block, and there is the set of + * constants that have uses after that block. + * + * Allocation proceeds in three passes. + * + * 1. Allocate space for the values that are used outside this block. + * + * 2. 
Allocate space for the values that are used only in this block. + * + * 3. Deallocate the space for the values that are used only in this block. + */ + + for (unsigned pass = 0; pass < 2; pass++) { + const bool used_in_single_block = pass != 0; + + for (unsigned i = 0; i < len; i++) { + if (imm[i].block == cur_block && + imm[i].used_in_single_block == used_in_single_block) { + /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions: + * + * "In Align16 mode, the channel selects and channel enables apply + * to a pair of half-floats, because these parameters are defined + * for DWord elements ONLY. This is applicable when both source + * and destination are half-floats." + * + * This means that Align16 instructions that use promoted HF + * immediates and use a <0,1,0>:HF region would read 2 HF slots + * instead of replicating the single one we want. To avoid this, we + * always populate both HF slots within a DWord with the constant. + */ + const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1; + + const fs_reg reg = allocate_slots(regs, num_regs, + imm[i].size * width, + get_alignment_for_imm(&imm[i]), + alloc); + + imm[i].nr = reg.nr; + imm[i].subreg_offset = reg.offset; + } + } + } + + for (unsigned i = 0; i < len; i++) { + if (imm[i].block == cur_block && imm[i].used_in_single_block) { + const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1; + + deallocate_slots(regs, num_regs, imm[i].nr, imm[i].subreg_offset, + imm[i].size * width); + } + } +} + +bool +fs_visitor::opt_combine_constants() +{ + void *const_ctx = ralloc_context(NULL); + + struct table table; + + /* For each of the dynamic arrays in the table, allocate about a page of + * memory. On LP64 systems, this gives 126 value objects 169 fs_inst_box + * objects. Even larger shaders that have been obverved rarely need more + * than 20 or 30 values. Most smaller shaders, which is most shaders, need + * at most a couple dozen fs_inst_box. 
+ */ + table.size = (4096 - (5 * sizeof(void *))) / sizeof(struct value); + table.num_values = 0; + table.values = ralloc_array(const_ctx, struct value, table.size); + + table.size_boxes = (4096 - (5 * sizeof(void *))) / sizeof(struct fs_inst_box); + table.num_boxes = 0; + table.boxes = ralloc_array(const_ctx, fs_inst_box, table.size_boxes); + + const brw::idom_tree &idom = idom_analysis.require(); + unsigned ip = -1; + + /* Make a pass through all instructions and count the number of times each + * constant is used by coissueable instructions or instructions that cannot + * take immediate arguments. + */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + ip++; + + switch (inst->opcode) { + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_POW: + if (inst->src[0].file == IMM) { + assert(inst->opcode != SHADER_OPCODE_POW); + + add_candidate_immediate(&table, inst, ip, 0, true, false, block, + devinfo, const_ctx); + } + + if (inst->src[1].file == IMM && devinfo->ver < 8) { + add_candidate_immediate(&table, inst, ip, 1, true, false, block, + devinfo, const_ctx); + } + + break; + + case BRW_OPCODE_ADD3: + case BRW_OPCODE_MAD: { + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != IMM) + continue; + + if (can_promote_src_as_imm(devinfo, inst, i)) + continue; + + add_candidate_immediate(&table, inst, ip, i, true, false, block, + devinfo, const_ctx); + } + + break; + } + + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_LRP: + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != IMM) + continue; + + add_candidate_immediate(&table, inst, ip, i, true, false, block, + devinfo, const_ctx); + } + + break; + + case BRW_OPCODE_SEL: + if (inst->src[0].file == IMM) { + /* It is possible to have src0 be immediate but src1 not be + * immediate for the non-commutative conditional modifiers (e.g., + * G). 
+ */ + if (inst->conditional_mod == BRW_CONDITIONAL_NONE || + /* Only GE and L are commutative. */ + inst->conditional_mod == BRW_CONDITIONAL_GE || + inst->conditional_mod == BRW_CONDITIONAL_L) { + assert(inst->src[1].file == IMM); + + add_candidate_immediate(&table, inst, ip, 0, true, true, block, + devinfo, const_ctx); + add_candidate_immediate(&table, inst, ip, 1, true, true, block, + devinfo, const_ctx); + } else { + add_candidate_immediate(&table, inst, ip, 0, true, false, block, + devinfo, const_ctx); + } + } + break; + + case BRW_OPCODE_ASR: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_ROL: + case BRW_OPCODE_ROR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + if (inst->src[0].file == IMM) { + add_candidate_immediate(&table, inst, ip, 0, true, false, block, + devinfo, const_ctx); + } + break; + + case BRW_OPCODE_MOV: + if (could_coissue(devinfo, inst) && inst->src[0].file == IMM) { + add_candidate_immediate(&table, inst, ip, 0, false, false, block, + devinfo, const_ctx); + } + break; + + case BRW_OPCODE_CMP: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + assert(inst->src[0].file != IMM); + + if (could_coissue(devinfo, inst) && inst->src[1].file == IMM) { + add_candidate_immediate(&table, inst, ip, 1, false, false, block, + devinfo, const_ctx); + } + break; + + default: + break; + } + } + + if (table.num_values == 0) { + ralloc_free(const_ctx); + return false; + } + + combine_constants_result *result = + brw_combine_constants(table.values, table.num_values); + + table.imm = ralloc_array(const_ctx, struct imm, result->num_values_to_emit); + table.len = 0; + + for (unsigned i = 0; i < result->num_values_to_emit; i++) { + struct imm *imm = &table.imm[table.len]; + + imm->block = NULL; + imm->inst = NULL; + imm->d64 = result->values_to_emit[i].value.u64; + imm->size = result->values_to_emit[i].bit_size / 8; + + imm->uses_by_coissue = 0; + imm->must_promote = false; + imm->is_half_float = false; + + imm->first_use_ip = UINT16_MAX; + imm->last_use_ip = 0; + + 
imm->uses = new(const_ctx) exec_list; + + const unsigned first_user = result->values_to_emit[i].first_user; + const unsigned last_user = first_user + + result->values_to_emit[i].num_users; + + for (unsigned j = first_user; j < last_user; j++) { + const unsigned idx = table.values[result->user_map[j].index].instr_index; + fs_inst_box *const ib = &table.boxes[idx]; + + const unsigned src = table.values[result->user_map[j].index].src; + + imm->uses->push_tail(link(const_ctx, ib->inst, src, + result->user_map[j].negate, + result->user_map[j].type)); + + if (ib->must_promote) + imm->must_promote = true; + else + imm->uses_by_coissue++; + + if (imm->block == NULL) { + /* Block should only be NULL on the first pass. On the first + * pass, inst should also be NULL. + */ + assert(imm->inst == NULL); + + imm->inst = ib->inst; + imm->block = ib->block; + imm->first_use_ip = ib->ip; + imm->last_use_ip = ib->ip; + imm->used_in_single_block = true; + } else { + bblock_t *intersection = idom.intersect(ib->block, + imm->block); + + if (ib->block != imm->block) + imm->used_in_single_block = false; + + if (imm->first_use_ip > ib->ip) { + imm->first_use_ip = ib->ip; + + /* If the first-use instruction is to be tracked, block must be + * the block that contains it. The old block was read in the + * idom.intersect call above, so it is safe to overwrite it + * here. + */ + imm->inst = ib->inst; + imm->block = ib->block; + } + + if (imm->last_use_ip < ib->ip) + imm->last_use_ip = ib->ip; + + /* The common dominator is not the block that contains the + * first-use instruction, so don't track that instruction. The + * load instruction will be added in the common dominator block + * instead of before the first-use instruction. 
+ */ + if (intersection != imm->block) + imm->inst = NULL; + + imm->block = intersection; + } + + if (ib->inst->src[src].type == BRW_REGISTER_TYPE_HF) + imm->is_half_float = true; + } + + /* Remove constants from the table that don't have enough uses to make + * them profitable to store in a register. + */ + if (imm->must_promote || imm->uses_by_coissue >= 4) + table.len++; + } + + delete result; + + if (table.len == 0) { + ralloc_free(const_ctx); + return false; + } + if (cfg->num_blocks != 1) + qsort(table.imm, table.len, sizeof(struct imm), compare); + + if (devinfo->ver > 7) { + struct register_allocation *regs = + (struct register_allocation *) calloc(table.len, sizeof(regs[0])); + + for (int i = 0; i < table.len; i++) { + regs[i].nr = UINT_MAX; + regs[i].avail = 0xffff; + } + + foreach_block(block, cfg) { + parcel_out_registers(table.imm, table.len, block, regs, table.len, + alloc, devinfo->ver); + } + + free(regs); + } else { + fs_reg reg(VGRF, alloc.allocate(1)); + reg.stride = 0; + + for (int i = 0; i < table.len; i++) { + struct imm *imm = &table.imm[i]; + + /* Put the immediate in an offset aligned to its size. Some + * instructions seem to have additional alignment requirements, so + * account for that too. + */ + reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm)); + + /* Ensure we have enough space in the register to copy the immediate */ + if (reg.offset + imm->size > REG_SIZE) { + reg.nr = alloc.allocate(1); + reg.offset = 0; + } + + imm->nr = reg.nr; + imm->subreg_offset = reg.offset; + + reg.offset += imm->size; + } + } + + bool rebuild_cfg = false; + + /* Insert MOVs to load the constant values into GRFs. */ + for (int i = 0; i < table.len; i++) { + struct imm *imm = &table.imm[i]; + + /* Insert it either before the instruction that generated the immediate + * or after the last non-control flow instruction of the common ancestor. 
+ */ + exec_node *n; + bblock_t *insert_block; + if (imm->inst != nullptr) { + n = imm->inst; + insert_block = imm->block; + } else { + if (imm->block->start()->opcode == BRW_OPCODE_DO) { + /* DO blocks are weird. They can contain only the single DO + * instruction. As a result, MOV instructions cannot be added to + * the DO block. + */ + bblock_t *next_block = imm->block->next(); + if (next_block->starts_with_control_flow()) { + /* This is the difficult case. This occurs for code like + * + * do { + * do { + * ... + * } while (...); + * } while (...); + * + * when the MOV instructions need to be inserted between the + * two DO instructions. + * + * To properly handle this scenario, a new block would need to + * be inserted. Doing so would require modifying arbitrary many + * CONTINUE, BREAK, and WHILE instructions to point to the new + * block. + * + * It is unlikely that this would ever be correct. Instead, + * insert the MOV instructions in the known wrong place and + * rebuild the CFG at the end of the pass. + */ + insert_block = imm->block; + n = insert_block->last_non_control_flow_inst()->next; + + rebuild_cfg = true; + } else { + insert_block = next_block; + n = insert_block->start(); + } + } else { + insert_block = imm->block; + n = insert_block->last_non_control_flow_inst()->next; + } + } + + /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions: + * + * "In Align16 mode, the channel selects and channel enables apply to a + * pair of half-floats, because these parameters are defined for DWord + * elements ONLY. This is applicable when both source and destination + * are half-floats." + * + * This means that Align16 instructions that use promoted HF immediates + * and use a <0,1,0>:HF region would read 2 HF slots instead of + * replicating the single one we want. To avoid this, we always populate + * both HF slots within a DWord with the constant. + */ + const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 
2 : 1; + const fs_builder ibld = fs_builder(this, width).at(insert_block, n).exec_all(); + + fs_reg reg(VGRF, imm->nr); + reg.offset = imm->subreg_offset; + reg.stride = 0; + + /* Put the immediate in an offset aligned to its size. Some instructions + * seem to have additional alignment requirements, so account for that + * too. + */ + assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm))); + + struct brw_reg imm_reg = build_imm_reg_for_copy(imm); + + /* Ensure we have enough space in the register to copy the immediate */ + assert(reg.offset + type_sz(imm_reg.type) * width <= REG_SIZE); + + ibld.MOV(retype(reg, imm_reg.type), imm_reg); + } + shader_stats.promoted_constants = table.len; + + /* Rewrite the immediate sources to refer to the new GRFs. */ + for (int i = 0; i < table.len; i++) { + foreach_list_typed(reg_link, link, link, table.imm[i].uses) { + fs_reg *reg = &link->inst->src[link->src]; + + if (link->inst->opcode == BRW_OPCODE_SEL) { + if (link->type == either_type) { + /* Do not change the register type. 
*/ + } else if (link->type == integer_only) { + reg->type = brw_int_type(type_sz(reg->type), true); + } else { + assert(link->type == float_only); + + switch (type_sz(reg->type)) { + case 2: + reg->type = BRW_REGISTER_TYPE_HF; + break; + case 4: + reg->type = BRW_REGISTER_TYPE_F; + break; + case 8: + reg->type = BRW_REGISTER_TYPE_DF; + break; + default: + unreachable("Bad type size"); + } + } + } else if ((link->inst->opcode == BRW_OPCODE_SHL || + link->inst->opcode == BRW_OPCODE_ASR) && + link->negate) { + reg->type = brw_int_type(type_sz(reg->type), true); + } + +#ifdef DEBUG + switch (reg->type) { + case BRW_REGISTER_TYPE_DF: + assert((isnan(reg->df) && isnan(table.imm[i].df)) || + (fabs(reg->df) == fabs(table.imm[i].df))); + break; + case BRW_REGISTER_TYPE_F: + assert((isnan(reg->f) && isnan(table.imm[i].f)) || + (fabsf(reg->f) == fabsf(table.imm[i].f))); + break; + case BRW_REGISTER_TYPE_HF: + assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) && + isnan(_mesa_half_to_float(table.imm[i].w))) || + (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) == + fabsf(_mesa_half_to_float(table.imm[i].w)))); + break; + case BRW_REGISTER_TYPE_Q: + assert(abs(reg->d64) == abs(table.imm[i].d64)); + break; + case BRW_REGISTER_TYPE_UQ: + assert(!link->negate); + assert(reg->d64 == table.imm[i].d64); + break; + case BRW_REGISTER_TYPE_D: + assert(abs(reg->d) == abs(table.imm[i].d)); + break; + case BRW_REGISTER_TYPE_UD: + assert(!link->negate); + assert(reg->d == table.imm[i].d); + break; + case BRW_REGISTER_TYPE_W: + assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w); + break; + case BRW_REGISTER_TYPE_UW: + assert(!link->negate); + assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w); + break; + default: + break; + } +#endif + + assert(link->inst->can_do_source_mods(devinfo) || !link->negate); + + reg->file = VGRF; + reg->offset = table.imm[i].subreg_offset; + reg->stride = 0; + reg->negate = link->negate; + reg->nr = table.imm[i].nr; + } + } + + /* Fixup any SEL 
instructions that have src0 still as an immediate. Fixup + * the types of any SEL instruction that have a negation on one of the + * sources. Adding the negation may have changed the type of that source, + * so the other source (and destination) must be changed to match. + */ + for (unsigned i = 0; i < table.num_boxes; i++) { + fs_inst *inst = table.boxes[i].inst; + + if (inst->opcode != BRW_OPCODE_SEL) + continue; + + /* If both sources have negation, the types had better be the same! */ + assert(!inst->src[0].negate || !inst->src[1].negate || + inst->src[0].type == inst->src[1].type); + + /* If either source has a negation, force the type of the other source + * and the type of the result to be the same. + */ + if (inst->src[0].negate) { + inst->src[1].type = inst->src[0].type; + inst->dst.type = inst->src[0].type; + } + + if (inst->src[1].negate) { + inst->src[0].type = inst->src[1].type; + inst->dst.type = inst->src[1].type; + } + + if (inst->src[0].file != IMM) + continue; + + assert(inst->src[1].file != IMM); + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE || + inst->conditional_mod == BRW_CONDITIONAL_GE || + inst->conditional_mod == BRW_CONDITIONAL_L); + + fs_reg temp = inst->src[0]; + inst->src[0] = inst->src[1]; + inst->src[1] = temp; + + /* If this was predicated, flipping operands means we also need to flip + * the predicate. 
+ */ + if (inst->conditional_mod == BRW_CONDITIONAL_NONE) + inst->predicate_inverse = !inst->predicate_inverse; + } + + if (debug) { + for (int i = 0; i < table.len; i++) { + struct imm *imm = &table.imm[i]; + + fprintf(stderr, + "0x%016" PRIx64 " - block %3d, reg %3d sub %2d, " + "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n", + (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)), + imm->block->num, + imm->nr, + imm->subreg_offset, + imm->must_promote, + imm->uses_by_coissue, + imm->first_use_ip, + imm->last_use_ip, + imm->last_use_ip - imm->first_use_ip); + } + } + + if (rebuild_cfg) { + /* When the CFG is initially built, the instructions are removed from + * the list of instructions stored in fs_visitor -- the same exec_node + * is used for membership in that list and in a block list. So we need + * to pull them back before rebuilding the CFG. + */ + assert(exec_list_length(&instructions) == 0); + foreach_block(block, cfg) { + exec_list_append(&instructions, &block->instructions); + } + + delete cfg; + cfg = NULL; + calculate_cfg(); + } + + ralloc_free(const_ctx); + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES | + (rebuild_cfg ? 
DEPENDENCY_BLOCKS : DEPENDENCY_NOTHING)); + + return true; +} diff --git a/src/intel/compiler/elk/brw_fs_copy_propagation.cpp b/src/intel/compiler/elk/brw_fs_copy_propagation.cpp new file mode 100644 index 00000000000..62c16be4e64 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_copy_propagation.cpp @@ -0,0 +1,1468 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_copy_propagation.cpp + * + * Support for global copy propagation in two passes: A local pass that does + * intra-block copy (and constant) propagation, and a global pass that uses + * dataflow analysis on the copies available at the end of each block to re-do + * local copy propagation with more copies available. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 12.5 (p356). 
 */

#include "util/bitset.h"
#include "util/u_math.h"
#include "util/rb_tree.h"
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_cfg.h"
#include "brw_eu.h"

using namespace brw;

namespace { /* avoid conflict with opt_copy_propagation_elements */
/**
 * One available-copy record: \c dst was most recently written as a copy of
 * \c src by an instruction with the given \c opcode.
 *
 * Each entry is linked into two red-black trees simultaneously (one keyed
 * by dst.nr, one by src.nr) through the embedded rb_node members, so the
 * same allocation can be found from either direction.
 */
struct acp_entry {
   struct rb_node by_dst;
   struct rb_node by_src;
   fs_reg dst;
   fs_reg src;
   /* Index of this entry in fs_copy_prop_dataflow::acp; assigned during
    * global dataflow setup and used to address the per-block bitsets.
    */
   unsigned global_idx;
   unsigned size_written;
   unsigned size_read;
   enum opcode opcode;
   bool is_partial_write;
   bool force_writemask_all;
};

/**
 * Compare two acp_entry::dst.nr
 *
 * This is intended to be used as the comparison function for rb_tree.
 */
static int
cmp_entry_dst_entry_dst(const struct rb_node *a_node, const struct rb_node *b_node)
{
   const struct acp_entry *a_entry =
      rb_node_data(struct acp_entry, a_node, by_dst);

   const struct acp_entry *b_entry =
      rb_node_data(struct acp_entry, b_node, by_dst);

   return a_entry->dst.nr - b_entry->dst.nr;
}

/**
 * Compare an acp_entry::dst.nr with a raw register number key.
 *
 * This is intended to be used as the search function for rb_tree.
 */
static int
cmp_entry_dst_nr(const struct rb_node *a_node, const void *b_key)
{
   const struct acp_entry *a_entry =
      rb_node_data(struct acp_entry, a_node, by_dst);

   return a_entry->dst.nr - (uintptr_t) b_key;
}

/**
 * Compare two acp_entry::src.nr
 *
 * This is intended to be used as the comparison function for rb_tree.
 */
static int
cmp_entry_src_entry_src(const struct rb_node *a_node, const struct rb_node *b_node)
{
   const struct acp_entry *a_entry =
      rb_node_data(struct acp_entry, a_node, by_src);

   const struct acp_entry *b_entry =
      rb_node_data(struct acp_entry, b_node, by_src);

   return a_entry->src.nr - b_entry->src.nr;
}

/**
 * Compare an acp_entry::src.nr with a raw nr.
 *
 * This is intended to be used as the comparison function for rb_tree.
 */
static int
cmp_entry_src_nr(const struct rb_node *a_node, const void *b_key)
{
   const struct acp_entry *a_entry =
      rb_node_data(struct acp_entry, a_node, by_src);

   return a_entry->src.nr - (uintptr_t) b_key;
}

/**
 * In-order iterator over one of the two rb_trees in struct acp.
 *
 * The next pointer is prefetched in the constructor and in operator++ so
 * that the entry currently pointed at may be removed from the tree while
 * iterating without invalidating the iterator.
 */
class acp_forward_iterator {
public:
   acp_forward_iterator(struct rb_node *n, unsigned offset)
      : curr(n), next(nullptr), offset(offset)
   {
      next = rb_node_next_or_null(curr);
   }

   acp_forward_iterator &operator++()
   {
      curr = next;
      next = rb_node_next_or_null(curr);

      return *this;
   }

   bool operator!=(const acp_forward_iterator &other) const
   {
      return curr != other.curr;
   }

   struct acp_entry *operator*() const
   {
      /* This open-codes part of rb_node_data. */
      return curr != NULL ? (struct acp_entry *)(((char *)curr) - offset)
                          : NULL;
   }

private:
   struct rb_node *curr;
   struct rb_node *next;
   unsigned offset;
};

/**
 * Set of available copies, indexed both by destination and by source
 * register number so kills can be found from either side.
 */
struct acp {
   struct rb_tree by_dst;
   struct rb_tree by_src;

   acp()
   {
      rb_tree_init(&by_dst);
      rb_tree_init(&by_src);
   }

   acp_forward_iterator begin()
   {
      return acp_forward_iterator(rb_tree_first(&by_src),
                                  rb_tree_offsetof(struct acp_entry, by_src, 0));
   }

   const acp_forward_iterator end() const
   {
      return acp_forward_iterator(nullptr, 0);
   }

   /* O(n) walk of the by_src tree; used only to size the global ACP table. */
   unsigned length()
   {
      unsigned l = 0;

      for (rb_node *iter = rb_tree_first(&by_src);
           iter != NULL; iter = rb_node_next(iter))
         l++;

      return l;
   }

   void add(acp_entry *entry)
   {
      rb_tree_insert(&by_dst, &entry->by_dst, cmp_entry_dst_entry_dst);
      rb_tree_insert(&by_src, &entry->by_src, cmp_entry_src_entry_src);
   }

   void remove(acp_entry *entry)
   {
      rb_tree_remove(&by_dst, &entry->by_dst);
      rb_tree_remove(&by_src, &entry->by_src);
   }

   /* Returns an iterator positioned at the first entry whose src.nr matches
    * nr (or end()); callers keep iterating while src.nr still matches.
    */
   acp_forward_iterator find_by_src(unsigned nr)
   {
      struct rb_node *rbn = rb_tree_search(&by_src,
                                           (void *)(uintptr_t) nr,
                                           cmp_entry_src_nr);

      return acp_forward_iterator(rbn, rb_tree_offsetof(struct acp_entry,
                                                        by_src, rbn));
   }

   /* Same as find_by_src, but keyed on dst.nr. */
   acp_forward_iterator find_by_dst(unsigned nr)
   {
      struct rb_node *rbn = rb_tree_search(&by_dst,
                                           (void *)(uintptr_t) nr,
                                           cmp_entry_dst_nr);

      return acp_forward_iterator(rbn, rb_tree_offsetof(struct acp_entry,
                                                        by_dst, rbn));
   }
};

/** Per-basic-block dataflow sets over the global ACP entry table. */
struct block_data {
   /**
    * Which entries in the fs_copy_prop_dataflow acp table are live at the
    * start of this block.  This is the useful output of the analysis, since
    * it lets us plug those into the local copy propagation on the second
    * pass.
    */
   BITSET_WORD *livein;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are live at the end
    * of this block.  This is done in initial setup from the per-block acps
    * returned by the first local copy prop pass.
    */
   BITSET_WORD *liveout;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are generated by
    * instructions in this block which reach the end of the block without
    * being killed.
    */
   BITSET_WORD *copy;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are killed over the
    * course of this block.
    */
   BITSET_WORD *kill;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are guaranteed to
    * have a fully uninitialized destination at the end of this block.
    */
   BITSET_WORD *undef;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table can the
    * start of this block be reached from.  Note that this is a weaker
    * condition than livein.
    */
   BITSET_WORD *reachin;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are
    * overwritten by an instruction with channel masks inconsistent
    * with the copy instruction (e.g. due to force_writemask_all).
    * Such an overwrite can cause the copy entry to become invalid
    * even if the copy instruction is subsequently re-executed for any
    * given channel i, since the execution of the overwrite for
    * channel i may corrupt other channels j!=i inactive for the
    * subsequent copy.
 */
   BITSET_WORD *exec_mismatch;
};

/**
 * Global dataflow driver: collects every copy surviving its defining block
 * into one table, then solves for the set of copies available at the start
 * of each block (livein) so the second local pass can use them.
 */
class fs_copy_prop_dataflow
{
public:
   fs_copy_prop_dataflow(linear_ctx *lin_ctx, cfg_t *cfg,
                         const fs_live_variables &live,
                         struct acp *out_acp);

   void setup_initial_values();
   void run();

   void dump_block_data() const UNUSED;

   cfg_t *cfg;
   const fs_live_variables &live;

   /* Flat table of all ACP entries, indexed by acp_entry::global_idx. */
   acp_entry **acp;
   int num_acp;
   int bitset_words;

   struct block_data *bd;
};
} /* anonymous namespace */

/**
 * Gather the per-block ACPs produced by the first local pass into a single
 * global table, allocate the per-block bitsets, then compute the fixed
 * point.
 *
 * \param out_acp  one struct acp per block, holding the copies still live
 *                 at the end of that block.
 */
fs_copy_prop_dataflow::fs_copy_prop_dataflow(linear_ctx *lin_ctx, cfg_t *cfg,
                                             const fs_live_variables &live,
                                             struct acp *out_acp)
   : cfg(cfg), live(live)
{
   bd = linear_zalloc_array(lin_ctx, struct block_data, cfg->num_blocks);

   num_acp = 0;
   foreach_block (block, cfg)
      num_acp += out_acp[block->num].length();

   bitset_words = BITSET_WORDS(num_acp);

   foreach_block (block, cfg) {
      bd[block->num].livein = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].liveout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].copy = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].kill = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].undef = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].reachin = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
      bd[block->num].exec_mismatch = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
   }

   acp = linear_zalloc_array(lin_ctx, struct acp_entry *, num_acp);

   /* Assign each entry a stable global index used by all the bitsets. */
   int next_acp = 0;
   foreach_block (block, cfg) {
      for (auto iter = out_acp[block->num].begin();
           iter != out_acp[block->num].end(); ++iter) {
         acp[next_acp] = *iter;

         (*iter)->global_idx = next_acp;

         /* opt_copy_propagation_local populates out_acp with copies created
          * in a block which are still live at the end of the block.  This
          * is exactly what we want in the COPY set.
          */
         BITSET_SET(bd[block->num].copy, next_acp);

         next_acp++;
      }
   }

   assert(next_acp == num_acp);

   setup_initial_values();
   run();
}

/**
 * Like reg_offset, but register must be VGRF or FIXED_GRF.
 */
static inline unsigned
grf_reg_offset(const fs_reg &r)
{
   return (r.file == VGRF ? 0 : r.nr) * REG_SIZE +
          r.offset +
          (r.file == FIXED_GRF ? r.subnr : 0);
}

/**
 * Like regions_overlap, but register must be VGRF or FIXED_GRF.
 */
static inline bool
grf_regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
   return reg_space(r) == reg_space(s) &&
          !(grf_reg_offset(r) + dr <= grf_reg_offset(s) ||
            grf_reg_offset(s) + ds <= grf_reg_offset(r));
}

/**
 * Set up initial values for each of the data flow sets, prior to running
 * the fixed-point algorithm.
 */
void
fs_copy_prop_dataflow::setup_initial_values()
{
   /* Initialize the COPY and KILL sets. */
   {
      struct acp acp_table;

      /* First, get all the KILLs for instructions which overwrite ACP
       * destinations.
 */
      for (int i = 0; i < num_acp; i++)
         acp_table.add(acp[i]);

      foreach_block (block, cfg) {
         foreach_inst_in_block(fs_inst, inst, block) {
            if (inst->dst.file != VGRF &&
                inst->dst.file != FIXED_GRF)
               continue;

            /* An overwrite of a copy's *source* invalidates the copy: the
             * copied value no longer matches the register it came from.
             */
            for (auto iter = acp_table.find_by_src(inst->dst.nr);
                 iter != acp_table.end() && (*iter)->src.nr == inst->dst.nr;
                 ++iter) {
               if (grf_regions_overlap(inst->dst, inst->size_written,
                                       (*iter)->src, (*iter)->size_read)) {
                  BITSET_SET(bd[block->num].kill, (*iter)->global_idx);
                  if (inst->force_writemask_all && !(*iter)->force_writemask_all)
                     BITSET_SET(bd[block->num].exec_mismatch, (*iter)->global_idx);
               }
            }

            if (inst->dst.file != VGRF)
               continue;

            /* An overwrite of a copy's *destination* likewise kills it. */
            for (auto iter = acp_table.find_by_dst(inst->dst.nr);
                 iter != acp_table.end() && (*iter)->dst.nr == inst->dst.nr;
                 ++iter) {
               if (grf_regions_overlap(inst->dst, inst->size_written,
                                       (*iter)->dst, (*iter)->size_written)) {
                  BITSET_SET(bd[block->num].kill, (*iter)->global_idx);
                  if (inst->force_writemask_all && !(*iter)->force_writemask_all)
                     BITSET_SET(bd[block->num].exec_mismatch, (*iter)->global_idx);
               }
            }
         }
      }
   }

   /* Populate the initial values for the livein and liveout sets.  For the
    * block at the start of the program, livein = 0 and liveout = copy.
    * For the others, set liveout and livein to ~0 (the universal set).
    */
   foreach_block (block, cfg) {
      if (block->parents.is_empty()) {
         for (int i = 0; i < bitset_words; i++) {
            bd[block->num].livein[i] = 0u;
            bd[block->num].liveout[i] = bd[block->num].copy[i];
         }
      } else {
         for (int i = 0; i < bitset_words; i++) {
            bd[block->num].liveout[i] = ~0u;
            bd[block->num].livein[i] = ~0u;
         }
      }
   }

   /* Initialize the undef set. */
   foreach_block (block, cfg) {
      for (int i = 0; i < num_acp; i++) {
         BITSET_SET(bd[block->num].undef, i);
         for (unsigned off = 0; off < acp[i]->size_written; off += REG_SIZE) {
            if (BITSET_TEST(live.block_data[block->num].defout,
                            live.var_from_reg(byte_offset(acp[i]->dst, off))))
               BITSET_CLEAR(bd[block->num].undef, i);
         }
      }
   }
}

/**
 * Solve the dataflow equations to a fixed point: propagate the livein,
 * liveout and reachin sets across block boundaries until nothing changes,
 * then do a second fixed-point pass for exec_mismatch (which consumes the
 * converged reachin sets).
 */
void
fs_copy_prop_dataflow::run()
{
   bool progress;

   do {
      progress = false;

      foreach_block (block, cfg) {
         /* Entry blocks were fully initialized in setup_initial_values(). */
         if (block->parents.is_empty())
            continue;

         for (int i = 0; i < bitset_words; i++) {
            const BITSET_WORD old_liveout = bd[block->num].liveout[i];
            const BITSET_WORD old_reachin = bd[block->num].reachin[i];
            BITSET_WORD livein_from_any_block = 0;

            /* Update livein for this block.  If a copy is live out of all
             * parent blocks, it's live coming in to this block.
             */
            bd[block->num].livein[i] = ~0u;
            foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
               bblock_t *parent = parent_link->block;
               /* Consider ACP entries with a known-undefined destination to
                * be available from the parent.  This is valid because we're
                * free to set the undefined variable equal to the source of
                * the ACP entry without breaking the application's
                * expectations, since the variable is undefined.
                */
               bd[block->num].livein[i] &= (bd[parent->num].liveout[i] |
                                            bd[parent->num].undef[i]);
               livein_from_any_block |= bd[parent->num].liveout[i];

               /* Update reachin for this block.  If the end of any
                * parent block is reachable from the copy, the start
                * of this block is reachable from it as well.
 */
               bd[block->num].reachin[i] |= (bd[parent->num].reachin[i] |
                                             bd[parent->num].copy[i]);
            }

            /* Limit to the set of ACP entries that can possibly be available
             * at the start of the block, since propagating from a variable
             * which is guaranteed to be undefined (rather than potentially
             * undefined for some dynamic control-flow paths) doesn't seem
             * particularly useful.
             */
            bd[block->num].livein[i] &= livein_from_any_block;

            /* Update liveout for this block. */
            bd[block->num].liveout[i] =
               bd[block->num].copy[i] | (bd[block->num].livein[i] &
                                         ~bd[block->num].kill[i]);

            if (old_liveout != bd[block->num].liveout[i] ||
                old_reachin != bd[block->num].reachin[i])
               progress = true;
         }
      }
   } while (progress);

   /* Perform a second fixed-point pass in order to propagate the
    * exec_mismatch bitsets.  Note that this requires an accurate
    * value of the reachin bitsets as input, which isn't available
    * until the end of the first propagation pass, so this loop cannot
    * be folded into the previous one.
    */
   do {
      progress = false;

      foreach_block (block, cfg) {
         for (int i = 0; i < bitset_words; i++) {
            const BITSET_WORD old_exec_mismatch = bd[block->num].exec_mismatch[i];

            /* Update exec_mismatch for this block.  If the end of a
             * parent block is reachable by an overwrite with
             * inconsistent execution masking, the start of this block
             * is reachable by such an overwrite as well.
             */
            foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
               bblock_t *parent = parent_link->block;
               bd[block->num].exec_mismatch[i] |= (bd[parent->num].exec_mismatch[i] &
                                                   bd[parent->num].reachin[i]);
            }

            /* Only consider overwrites with inconsistent execution
             * masking if they are reachable from the copy, since
             * overwrites unreachable from a copy are harmless to that
             * copy.
             */
            bd[block->num].exec_mismatch[i] &= bd[block->num].reachin[i];
            if (old_exec_mismatch != bd[block->num].exec_mismatch[i])
               progress = true;
         }
      }
   } while (progress);
}

/** Debug helper: print every block's dataflow bitsets to stderr. */
void
fs_copy_prop_dataflow::dump_block_data() const
{
   foreach_block (block, cfg) {
      fprintf(stderr, "Block %d [%d, %d] (parents ", block->num,
              block->start_ip, block->end_ip);
      foreach_list_typed(bblock_link, link, link, &block->parents) {
         bblock_t *parent = link->block;
         fprintf(stderr, "%d ", parent->num);
      }
      fprintf(stderr, "):\n");
      fprintf(stderr, " livein = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].livein[i]);
      fprintf(stderr, ", liveout = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].liveout[i]);
      fprintf(stderr, ",\n copy = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].copy[i]);
      fprintf(stderr, ", kill = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].kill[i]);
      fprintf(stderr, "\n");
   }
}

/* True for the bitwise-logic opcodes, which (on Gfx8+) don't accept source
 * modifiers and don't interpret immediates arithmetically.
 */
static bool
is_logic_op(enum opcode opcode)
{
   return (opcode == BRW_OPCODE_AND ||
           opcode == BRW_OPCODE_OR ||
           opcode == BRW_OPCODE_XOR ||
           opcode == BRW_OPCODE_NOT);
}

/* Whether source \p arg of \p inst could legally use horizontal stride
 * \p stride, given the destination type and per-generation regioning rules.
 */
static bool
can_take_stride(fs_inst *inst, brw_reg_type dst_type,
                unsigned arg, unsigned stride,
                const struct brw_compiler *compiler)
{
   const struct intel_device_info *devinfo = compiler->devinfo;

   if (stride > 4)
      return false;

   /* Bail if the channels of the source need to be aligned to the byte offset
    * of the corresponding channel of the destination, and the provided stride
    * would break this restriction.
    */
   if (has_dst_aligned_region_restriction(devinfo, inst, dst_type) &&
       !(type_sz(inst->src[arg].type) * stride ==
         type_sz(dst_type) * inst->dst.stride ||
         stride == 0))
      return false;

   /* 3-source instructions can only be Align16, which restricts what strides
    * they can take.
They can only take a stride of 1 (the usual case), or 0
    * with a special "repctrl" bit.  But the repctrl bit doesn't work for
    * 64-bit datatypes, so if the source type is 64-bit then only a stride of
    * 1 is allowed.  From the Broadwell PRM, Volume 7 "3D Media GPGPU", page
    * 944:
    *
    *    This is applicable to 32b datatypes and 16b datatype.  64b datatypes
    *    cannot use the replicate control.
    */
   if (inst->is_3src(compiler)) {
      if (type_sz(inst->src[arg].type) > 4)
         return stride == 1;
      else
         return stride == 1 || stride == 0;
   }

   /* From the Broadwell PRM, Volume 2a "Command Reference - Instructions",
    * page 391 ("Extended Math Function"):
    *
    *     The following restrictions apply for align1 mode: Scalar source is
    *     supported.  Source and destination horizontal stride must be the
    *     same.
    *
    * From the Haswell PRM Volume 2b "Command Reference - Instructions", page
    * 134 ("Extended Math Function"):
    *
    *    Scalar source is supported.  Source and destination horizontal stride
    *    must be 1.
    *
    * and similar language exists for IVB and SNB.  Pre-SNB, math instructions
    * are sends, so the sources are moved to MRF's and there are no
    * restrictions.
    */
   if (inst->is_math()) {
      if (devinfo->ver == 6 || devinfo->ver == 7) {
         assert(inst->dst.stride == 1);
         return stride == 1 || stride == 0;
      } else if (devinfo->ver >= 8) {
         return stride == inst->dst.stride || stride == 0;
      }
   }

   return true;
}

/* Instructions whose generator-backend implementation assumes packed
 * (stride-1) operands, so strided regions must not be propagated into them.
 */
static bool
instruction_requires_packed_data(fs_inst *inst)
{
   switch (inst->opcode) {
   case FS_OPCODE_DDX_FINE:
   case FS_OPCODE_DDX_COARSE:
   case FS_OPCODE_DDY_FINE:
   case FS_OPCODE_DDY_COARSE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
      return true;
   default:
      return false;
   }
}

/* Try to replace source \p arg of \p inst with the source of the copy
 * recorded in \p entry.  Returns true and rewrites the source region in
 * place on success; returns false (leaving inst untouched) if any hardware
 * regioning or semantic restriction would be violated.
 */
static bool
try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
                   acp_entry *entry, int arg,
                   const brw::simple_allocator &alloc,
                   uint8_t max_polygons)
{
   if (inst->src[arg].file != VGRF)
      return false;

   const struct intel_device_info *devinfo = compiler->devinfo;

   assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
          entry->src.file == ATTR || entry->src.file == FIXED_GRF);

   /* Avoid propagating a LOAD_PAYLOAD instruction into another if there is a
    * good chance that we'll be able to eliminate the latter through register
    * coalescing.  If only part of the sources of the second LOAD_PAYLOAD can
    * be simplified through copy propagation we would be making register
    * coalescing impossible, ending up with unnecessary copies in the program.
    * This is also the case for is_multi_copy_payload() copies that can only
    * be coalesced when the instruction is lowered into a sequence of MOVs.
    *
    * Worse -- In cases where the ACP entry was the result of CSE combining
    * multiple LOAD_PAYLOAD subexpressions, propagating the first LOAD_PAYLOAD
    * into the second would undo the work of CSE, leading to an infinite
    * optimization loop.  Avoid this by detecting LOAD_PAYLOAD copies from CSE
    * temporaries which should match is_coalescing_payload().
 */
   if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
       (is_coalescing_payload(alloc, inst) || is_multi_copy_payload(inst)))
      return false;

   assert(entry->dst.file == VGRF);
   if (inst->src[arg].nr != entry->dst.nr)
      return false;

   /* Bail if inst is reading a range that isn't contained in the range
    * that entry is writing.
    */
   if (!region_contained_in(inst->src[arg], inst->size_read(arg),
                            entry->dst, entry->size_written))
      return false;

   /* Send messages with EOT set are restricted to use g112-g127 (and we
    * sometimes need g127 for other purposes), so avoid copy propagating
    * anything that would make it impossible to satisfy that restriction.
    */
   if (inst->eot) {
      /* Avoid propagating a FIXED_GRF register, as that's already pinned. */
      if (entry->src.file == FIXED_GRF)
         return false;

      /* We might be propagating from a large register, while the SEND only
       * is reading a portion of it (say the .A channel in an RGBA value).
       * We need to pin both split SEND sources in g112-g126/127, so only
       * allow this if the registers aren't too large.
       */
      if (inst->opcode == SHADER_OPCODE_SEND && entry->src.file == VGRF) {
         int other_src = arg == 2 ? 3 : 2;
         unsigned other_size = inst->src[other_src].file == VGRF ?
                               alloc.sizes[inst->src[other_src].nr] :
                               inst->size_read(other_src);
         unsigned prop_src_size = alloc.sizes[entry->src.nr];
         if (other_size + prop_src_size > 15)
            return false;
      }
   }

   /* Avoid propagating odd-numbered FIXED_GRF registers into the first source
    * of a LINTERP instruction on platforms where the PLN instruction has
    * register alignment restrictions.
    */
   if (devinfo->has_pln && devinfo->ver <= 6 &&
       entry->src.file == FIXED_GRF && (entry->src.nr & 1) &&
       inst->opcode == FS_OPCODE_LINTERP && arg == 0)
      return false;

   /* we can't generally copy-propagate UD negations because we
    * can end up accessing the resulting values as signed integers
    * instead.  See also resolve_ud_negate() and comment in
    * fs_generator::generate_code.
    */
   if (entry->src.type == BRW_REGISTER_TYPE_UD &&
       entry->src.negate)
      return false;

   bool has_source_modifiers = entry->src.abs || entry->src.negate;

   if (has_source_modifiers && !inst->can_do_source_mods(devinfo))
      return false;

   /* Reject cases that would violate register regioning restrictions. */
   if ((entry->src.file == UNIFORM || !entry->src.is_contiguous()) &&
       ((devinfo->ver == 6 && inst->is_math()) ||
        inst->is_send_from_grf() ||
        inst->uses_indirect_addressing())) {
      return false;
   }

   if (has_source_modifiers &&
       inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE)
      return false;

   /* Some instructions implemented in the generator backend, such as
    * derivatives, assume that their operands are packed so we can't
    * generally propagate strided regions to them.
    */
   const unsigned entry_stride = (entry->src.file == FIXED_GRF ? 1 :
                                  entry->src.stride);
   if (instruction_requires_packed_data(inst) && entry_stride != 1)
      return false;

   const brw_reg_type dst_type = (has_source_modifiers &&
                                  entry->dst.type != inst->src[arg].type) ?
      entry->dst.type : inst->dst.type;

   /* Bail if the result of composing both strides would exceed the
    * hardware limit.
    */
   if (!can_take_stride(inst, dst_type, arg,
                        entry_stride * inst->src[arg].stride,
                        compiler))
      return false;

   /* From the Cherry Trail/Braswell PRMs, Volume 7: 3D Media GPGPU:
    *    EU Overview
    *       Register Region Restrictions
    *          Special Requirements for Handling Double Precision Data Types :
    *
    *   "When source or destination datatype is 64b or operation is integer
    *    DWord multiply, regioning in Align1 must follow these rules:
    *
    *    1. Source and Destination horizontal stride must be aligned to the
    *       same qword.
    *    2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
    *    3. Source and Destination offset must be the same, except the case
    *       of scalar source."
    *
    * Most of this is already checked in can_take_stride(), we're only left
    * with checking 3.
    */
   if (has_dst_aligned_region_restriction(devinfo, inst, dst_type) &&
       entry_stride != 0 &&
       (reg_offset(inst->dst) % REG_SIZE) != (reg_offset(entry->src) % REG_SIZE))
      return false;

   /* The <8;8,0> regions used for FS attributes in multipolygon
    * dispatch mode could violate regioning restrictions, don't copy
    * propagate them in such cases.
    */
   if (entry->src.file == ATTR && max_polygons > 1 &&
       (has_dst_aligned_region_restriction(devinfo, inst, dst_type) ||
        instruction_requires_packed_data(inst) ||
        (inst->is_3src(compiler) && arg == 2) ||
        entry->dst.type != inst->src[arg].type))
      return false;

   /* Bail if the source FIXED_GRF region of the copy cannot be trivially
    * composed with the source region of the instruction -- E.g. because the
    * copy uses some extended stride greater than 4 not supported natively by
    * the hardware as a horizontal stride, or because instruction compression
    * could require us to use a vertical stride shorter than a GRF.
    */
   if (entry->src.file == FIXED_GRF &&
       (inst->src[arg].stride > 4 ||
        inst->dst.component_size(inst->exec_size) >
        inst->src[arg].component_size(inst->exec_size)))
      return false;

   /* Bail if the instruction type is larger than the execution type of the
    * copy, what implies that each channel is reading multiple channels of the
    * destination of the copy, and simply replacing the sources would give a
    * program with different semantics.
    */
   if ((type_sz(entry->dst.type) < type_sz(inst->src[arg].type) ||
        entry->is_partial_write) &&
       inst->opcode != BRW_OPCODE_MOV) {
      return false;
   }

   /* Bail if the result of composing both strides cannot be expressed
    * as another stride. This avoids, for example, trying to transform
    * this:
    *
    *     MOV (8) rX<1>UD rY<0;1,0>UD
    *     FOO (8) ...     rX<8;8,1>UW
    *
    * into this:
    *
    *     FOO (8) ...     rY<0;1,0>UW
    *
    * Which would have different semantics.
    */
   if (entry_stride != 1 &&
       (inst->src[arg].stride *
        type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
      return false;

   /* Since semantics of source modifiers are type-dependent we need to
    * ensure that the meaning of the instruction remains the same if we
    * change the type. If the sizes of the types are different the new
    * instruction will read a different amount of data than the original
    * and the semantics will always be different.
    */
   if (has_source_modifiers &&
       entry->dst.type != inst->src[arg].type &&
       (!inst->can_change_types() ||
        type_sz(entry->dst.type) != type_sz(inst->src[arg].type)))
      return false;

   if (devinfo->ver >= 8 && (entry->src.negate || entry->src.abs) &&
       is_logic_op(inst->opcode)) {
      return false;
   }

   /* All the legality checks passed; everything below mutates inst. */

   /* Save the offset of inst->src[arg] relative to entry->dst for it to be
    * applied later.
    */
   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;

   /* Fold the copy into the instruction consuming it. */
   inst->src[arg].file = entry->src.file;
   inst->src[arg].nr = entry->src.nr;
   inst->src[arg].subnr = entry->src.subnr;
   inst->src[arg].offset = entry->src.offset;

   /* Compose the strides of both regions. */
   if (entry->src.file == FIXED_GRF) {
      if (inst->src[arg].stride) {
         const unsigned orig_width = 1 << entry->src.width;
         const unsigned reg_width = REG_SIZE / (type_sz(inst->src[arg].type) *
                                                inst->src[arg].stride);
         inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1;
         inst->src[arg].hstride = cvt(inst->src[arg].stride);
         inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width;
      } else {
         inst->src[arg].vstride = inst->src[arg].hstride =
            inst->src[arg].width = 0;
      }

      inst->src[arg].stride = 1;

      /* Hopefully no Align16 around here... */
      assert(entry->src.swizzle == BRW_SWIZZLE_XYZW);
      inst->src[arg].swizzle = entry->src.swizzle;
   } else {
      inst->src[arg].stride *= entry->src.stride;
   }

   /* Compute the first component of the copy that the instruction is
    * reading, and the base byte offset within that component.
    */
   assert((entry->dst.offset % REG_SIZE == 0 || inst->opcode == BRW_OPCODE_MOV) &&
          entry->dst.stride == 1);
   const unsigned component = rel_offset / type_sz(entry->dst.type);
   const unsigned suboffset = rel_offset % type_sz(entry->dst.type);

   /* Calculate the byte offset at the origin of the copy of the given
    * component and suboffset.
    */
   inst->src[arg] = byte_offset(inst->src[arg],
      component * entry_stride * type_sz(entry->src.type) + suboffset);

   if (has_source_modifiers) {
      if (entry->dst.type != inst->src[arg].type) {
         /* We are propagating source modifiers from a MOV with a different
          * type.  If we got here, then we can just change the source and
          * destination types of the instruction and keep going.
          */
         for (int i = 0; i < inst->sources; i++) {
            inst->src[i].type = entry->dst.type;
         }
         inst->dst.type = entry->dst.type;
      }

      if (!inst->src[arg].abs) {
         inst->src[arg].abs = entry->src.abs;
         inst->src[arg].negate ^= entry->src.negate;
      }
   }

   return true;
}


/* Try to replace source \p arg of \p inst with the immediate value recorded
 * in \p entry, commuting operands or bailing where the opcode or hardware
 * disallows an immediate in that position.
 */
static bool
try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
                       acp_entry *entry, int arg)
{
   const struct intel_device_info *devinfo = compiler->devinfo;
   bool progress = false;

   if (type_sz(entry->src.type) > 4)
      return false;

   if (inst->src[arg].file != VGRF)
      return false;

   assert(entry->dst.file == VGRF);
   if (inst->src[arg].nr != entry->dst.nr)
      return false;

   /* Bail if inst is reading a range that isn't contained in the range
    * that entry is writing.
+ */ + if (!region_contained_in(inst->src[arg], inst->size_read(arg), + entry->dst, entry->size_written)) + return false; + + /* If the size of the use type is larger than the size of the entry + * type, the entry doesn't contain all of the data that the user is + * trying to use. + */ + if (type_sz(inst->src[arg].type) > type_sz(entry->dst.type)) + return false; + + fs_reg val = entry->src; + + /* If the size of the use type is smaller than the size of the entry, + * clamp the value to the range of the use type. This enables constant + * copy propagation in cases like + * + * + * mov(8) g12<1>UD 0x0000000cUD + * ... + * mul(8) g47<1>D g86<8,8,1>D g12<16,8,2>W + */ + if (type_sz(inst->src[arg].type) < type_sz(entry->dst.type)) { + if (type_sz(inst->src[arg].type) != 2 || type_sz(entry->dst.type) != 4) + return false; + + assert(inst->src[arg].subnr == 0 || inst->src[arg].subnr == 2); + + /* When subnr is 0, we want the lower 16-bits, and when it's 2, we + * want the upper 16-bits. No other values of subnr are valid for a + * UD source. + */ + const uint16_t v = inst->src[arg].subnr == 2 ? val.ud >> 16 : val.ud; + + val.ud = v | (uint32_t(v) << 16); + } + + val.type = inst->src[arg].type; + + if (inst->src[arg].abs) { + if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) || + !brw_abs_immediate(val.type, &val.as_brw_reg())) { + return false; + } + } + + if (inst->src[arg].negate) { + if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) || + !brw_negate_immediate(val.type, &val.as_brw_reg())) { + return false; + } + } + + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case SHADER_OPCODE_LOAD_PAYLOAD: + case FS_OPCODE_PACK: + inst->src[arg] = val; + progress = true; + break; + + case SHADER_OPCODE_POW: + /* Allow constant propagation into src1 (except on Gen 6 which + * doesn't support scalar source math), and let constant combining + * promote the constant on Gen < 8. 
+ */ + if (devinfo->ver == 6) + break; + + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } + break; + + case BRW_OPCODE_SUBB: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } + break; + + case BRW_OPCODE_MACH: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_ADD: + case BRW_OPCODE_XOR: + case BRW_OPCODE_ADDC: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } else if (arg == 0 && inst->src[1].file != IMM) { + /* Don't copy propagate the constant in situations like + * + * mov(8) g8<1>D 0x7fffffffD + * mul(8) g16<1>D g8<8,8,1>D g15<16,8,2>W + * + * On platforms that only have a 32x16 multiplier, this will + * result in lowering the multiply to + * + * mul(8) g15<1>D g14<8,8,1>D 0xffffUW + * mul(8) g16<1>D g14<8,8,1>D 0x7fffUW + * add(8) g15.1<2>UW g15.1<16,8,2>UW g16<16,8,2>UW + * + * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it + * results in + * + * mul(8) g16<1>D g15<16,8,2>W 0x7fffffffD + * + * Volume 2a of the Skylake PRM says: + * + * When multiplying a DW and any lower precision integer, the + * DW operand must on src0. + */ + if (inst->opcode == BRW_OPCODE_MUL && + type_sz(inst->src[1].type) < 4 && + type_sz(val.type) == 4) + break; + + /* Fit this constant in by commuting the operands. + * Exception: we can't do this for 32-bit integer MUL/MACH + * because it's asymmetric. + * + * The BSpec says for Broadwell that + * + * "When multiplying DW x DW, the dst cannot be accumulator." + * + * Integer MUL with a non-accumulator destination will be lowered + * by lower_integer_multiplication(), so don't restrict it. + */ + if (((inst->opcode == BRW_OPCODE_MUL && + inst->dst.is_accumulator()) || + inst->opcode == BRW_OPCODE_MACH) && + (inst->src[1].type == BRW_REGISTER_TYPE_D || + inst->src[1].type == BRW_REGISTER_TYPE_UD)) + break; + inst->src[0] = inst->src[1]; + inst->src[1] = val; + progress = true; + } + break; + + case BRW_OPCODE_ADD3: + /* add3 can have a single imm16 source. 
Proceed if the source type is + * already W or UW or the value can be coerced to one of those types. + */ + if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW) + ; /* Nothing to do. */ + else if (val.ud <= 0xffff) + val = brw_imm_uw(val.ud); + else if (val.d >= -0x8000 && val.d <= 0x7fff) + val = brw_imm_w(val.d); + else + break; + + if (arg == 2) { + inst->src[arg] = val; + progress = true; + } else if (inst->src[2].file != IMM) { + inst->src[arg] = inst->src[2]; + inst->src[2] = val; + progress = true; + } + + break; + + case BRW_OPCODE_CMP: + case BRW_OPCODE_IF: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } else if (arg == 0 && inst->src[1].file != IMM) { + enum brw_conditional_mod new_cmod; + + new_cmod = brw_swap_cmod(inst->conditional_mod); + if (new_cmod != BRW_CONDITIONAL_NONE) { + /* Fit this constant in by swapping the operands and + * flipping the test + */ + inst->src[0] = inst->src[1]; + inst->src[1] = val; + inst->conditional_mod = new_cmod; + progress = true; + } + } + break; + + case BRW_OPCODE_SEL: + if (arg == 1) { + inst->src[arg] = val; + progress = true; + } else if (arg == 0) { + if (inst->src[1].file != IMM && + (inst->conditional_mod == BRW_CONDITIONAL_NONE || + /* Only GE and L are commutative. */ + inst->conditional_mod == BRW_CONDITIONAL_GE || + inst->conditional_mod == BRW_CONDITIONAL_L)) { + inst->src[0] = inst->src[1]; + inst->src[1] = val; + + /* If this was predicated, flipping operands means + * we also need to flip the predicate. + */ + if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { + inst->predicate_inverse = + !inst->predicate_inverse; + } + } else { + inst->src[0] = val; + } + + progress = true; + } + break; + + case FS_OPCODE_FB_WRITE_LOGICAL: + /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are + * bit-cast using a strided region so they cannot be immediates. 
+ */ + if (arg != FB_WRITE_LOGICAL_SRC_SRC_STENCIL && + arg != FB_WRITE_LOGICAL_SRC_OMASK) { + inst->src[arg] = val; + progress = true; + } + break; + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + /* Allow constant propagation into either source (except on Gen 6 + * which doesn't support scalar source math). Constant combining + * promote the src1 constant on Gen < 8, and it will promote the src0 + * constant on all platforms. + */ + if (devinfo->ver == 6) + break; + + FALLTHROUGH; + case BRW_OPCODE_AND: + case BRW_OPCODE_ASR: + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_ROL: + case BRW_OPCODE_ROR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_SHR: + case BRW_OPCODE_OR: + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case SHADER_OPCODE_BROADCAST: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + case SHADER_OPCODE_SHUFFLE: + inst->src[arg] = val; + progress = true; + break; + + default: + break; + } + 
+ return progress; +} + +static bool +can_propagate_from(fs_inst *inst) +{ + return (inst->opcode == BRW_OPCODE_MOV && + inst->dst.file == VGRF && + ((inst->src[0].file == VGRF && + !grf_regions_overlap(inst->dst, inst->size_written, + inst->src[0], inst->size_read(0))) || + inst->src[0].file == ATTR || + inst->src[0].file == UNIFORM || + inst->src[0].file == IMM || + (inst->src[0].file == FIXED_GRF && + inst->src[0].is_contiguous())) && + inst->src[0].type == inst->dst.type && + !inst->saturate && + /* Subset of !is_partial_write() conditions. */ + !inst->predicate && inst->dst.is_contiguous()) || + is_identity_payload(FIXED_GRF, inst); +} + +/* Walks a basic block and does copy propagation on it using the acp + * list. + */ +static bool +opt_copy_propagation_local(const brw_compiler *compiler, linear_ctx *lin_ctx, + bblock_t *block, struct acp &acp, + const brw::simple_allocator &alloc, + uint8_t max_polygons) +{ + bool progress = false; + + foreach_inst_in_block(fs_inst, inst, block) { + /* Try propagating into this instruction. */ + bool instruction_progress = false; + for (int i = inst->sources - 1; i >= 0; i--) { + if (inst->src[i].file != VGRF) + continue; + + for (auto iter = acp.find_by_dst(inst->src[i].nr); + iter != acp.end() && (*iter)->dst.nr == inst->src[i].nr; + ++iter) { + if ((*iter)->src.file == IMM) { + if (try_constant_propagate(compiler, inst, *iter, i)) { + instruction_progress = true; + break; + } + } else { + if (try_copy_propagate(compiler, inst, *iter, i, alloc, + max_polygons)) { + instruction_progress = true; + break; + } + } + } + } + + if (instruction_progress) { + progress = true; + + /* ADD3 can only have the immediate as src0. */ + if (inst->opcode == BRW_OPCODE_ADD3) { + if (inst->src[2].file == IMM) { + const auto src0 = inst->src[0]; + inst->src[0] = inst->src[2]; + inst->src[2] = src0; + } + } + + /* If only one of the sources of a 2-source, commutative instruction (e.g., + * AND) is immediate, it must be src1. 
If both are immediate, opt_algebraic + * should fold it away. + */ + if (inst->sources == 2 && inst->is_commutative() && + inst->src[0].file == IMM && inst->src[1].file != IMM) { + const auto src1 = inst->src[1]; + inst->src[1] = inst->src[0]; + inst->src[0] = src1; + } + } + + /* kill the destination from the ACP */ + if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) { + for (auto iter = acp.find_by_dst(inst->dst.nr); + iter != acp.end() && (*iter)->dst.nr == inst->dst.nr; + ++iter) { + if (grf_regions_overlap((*iter)->dst, (*iter)->size_written, + inst->dst, inst->size_written)) + acp.remove(*iter); + } + + for (auto iter = acp.find_by_src(inst->dst.nr); + iter != acp.end() && (*iter)->src.nr == inst->dst.nr; + ++iter) { + /* Make sure we kill the entry if this instruction overwrites + * _any_ of the registers that it reads + */ + if (grf_regions_overlap((*iter)->src, (*iter)->size_read, + inst->dst, inst->size_written)) + acp.remove(*iter); + } + } + + /* If this instruction's source could potentially be folded into the + * operand of another instruction, add it to the ACP. + */ + if (can_propagate_from(inst)) { + acp_entry *entry = linear_zalloc(lin_ctx, acp_entry); + entry->dst = inst->dst; + entry->src = inst->src[0]; + entry->size_written = inst->size_written; + for (unsigned i = 0; i < inst->sources; i++) + entry->size_read += inst->size_read(i); + entry->opcode = inst->opcode; + entry->is_partial_write = inst->is_partial_write(); + entry->force_writemask_all = inst->force_writemask_all; + acp.add(entry); + } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD && + inst->dst.file == VGRF) { + int offset = 0; + for (int i = 0; i < inst->sources; i++) { + int effective_width = i < inst->header_size ? 
8 : inst->exec_size; + const unsigned size_written = effective_width * + type_sz(inst->src[i].type); + if (inst->src[i].file == VGRF || + (inst->src[i].file == FIXED_GRF && + inst->src[i].is_contiguous())) { + const brw_reg_type t = i < inst->header_size ? + BRW_REGISTER_TYPE_UD : inst->src[i].type; + fs_reg dst = byte_offset(retype(inst->dst, t), offset); + if (!dst.equals(inst->src[i])) { + acp_entry *entry = linear_zalloc(lin_ctx, acp_entry); + entry->dst = dst; + entry->src = retype(inst->src[i], t); + entry->size_written = size_written; + entry->size_read = inst->size_read(i); + entry->opcode = inst->opcode; + entry->force_writemask_all = inst->force_writemask_all; + acp.add(entry); + } + } + offset += size_written; + } + } + } + + return progress; +} + +bool +fs_visitor::opt_copy_propagation() +{ + bool progress = false; + void *copy_prop_ctx = ralloc_context(NULL); + linear_ctx *lin_ctx = linear_context(copy_prop_ctx); + struct acp out_acp[cfg->num_blocks]; + + const fs_live_variables &live = live_analysis.require(); + + /* First, walk through each block doing local copy propagation and getting + * the set of copies available at the end of the block. + */ + foreach_block (block, cfg) { + progress = opt_copy_propagation_local(compiler, lin_ctx, block, + out_acp[block->num], alloc, + max_polygons) || progress; + + /* If the destination of an ACP entry exists only within this block, + * then there's no need to keep it for dataflow analysis. We can delete + * it from the out_acp table and avoid growing the bitsets any bigger + * than we absolutely have to. + * + * Because nothing in opt_copy_propagation_local touches the block + * start/end IPs and opt_copy_propagation_local is incapable of + * extending the live range of an ACP destination beyond the block, + * it's safe to use the liveness information in this way. 
+ */ + for (auto iter = out_acp[block->num].begin(); + iter != out_acp[block->num].end(); ++iter) { + assert((*iter)->dst.file == VGRF); + if (block->start_ip <= live.vgrf_start[(*iter)->dst.nr] && + live.vgrf_end[(*iter)->dst.nr] <= block->end_ip) { + out_acp[block->num].remove(*iter); + } + } + } + + /* Do dataflow analysis for those available copies. */ + fs_copy_prop_dataflow dataflow(lin_ctx, cfg, live, out_acp); + + /* Next, re-run local copy propagation, this time with the set of copies + * provided by the dataflow analysis available at the start of a block. + */ + foreach_block (block, cfg) { + struct acp in_acp; + + for (int i = 0; i < dataflow.num_acp; i++) { + if (BITSET_TEST(dataflow.bd[block->num].livein, i) && + !BITSET_TEST(dataflow.bd[block->num].exec_mismatch, i)) { + struct acp_entry *entry = dataflow.acp[i]; + in_acp.add(entry); + } + } + + progress = opt_copy_propagation_local(compiler, lin_ctx, block, + in_acp, alloc, max_polygons) || + progress; + } + + ralloc_free(copy_prop_ctx); + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | + DEPENDENCY_INSTRUCTION_DETAIL); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_cse.cpp b/src/intel/compiler/elk/brw_fs_cse.cpp new file mode 100644 index 00000000000..8fa1d281b06 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_cse.cpp @@ -0,0 +1,396 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all 
copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_cfg.h" + +/** @file brw_fs_cse.cpp + * + * Support for local common subexpression elimination. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 13.1 (p378). + */ + +using namespace brw; + +namespace { +struct aeb_entry : public exec_node { + /** The instruction that generates the expression value. */ + fs_inst *generator; + + /** The temporary where the value is stored. 
*/ + fs_reg tmp; +}; +} + +static bool +is_expression(const fs_visitor *v, const fs_inst *const inst) +{ + switch (inst->opcode) { + case BRW_OPCODE_MOV: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_ASR: + case BRW_OPCODE_CMP: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_ADD: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MULH: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_LINE: + case BRW_OPCODE_PLN: + case BRW_OPCODE_MAD: + case BRW_OPCODE_LRP: + case FS_OPCODE_FB_READ_LOGICAL: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + case FS_OPCODE_LINTERP: + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: + case FS_OPCODE_LOAD_LIVE_CHANNELS: + case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_MOV_INDIRECT: + case SHADER_OPCODE_TEX_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_UMS_LOGICAL: + case SHADER_OPCODE_TXF_MCS_LOGICAL: + case SHADER_OPCODE_LOD_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case FS_OPCODE_PACK: + return true; + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return inst->mlen < 2; + case SHADER_OPCODE_LOAD_PAYLOAD: + return !is_coalescing_payload(v->alloc, inst); + default: + return inst->is_send_from_grf() && !inst->has_side_effects() && + 
!inst->is_volatile(); + } +} + +static bool +operands_match(const fs_inst *a, const fs_inst *b, bool *negate) +{ + fs_reg *xs = a->src; + fs_reg *ys = b->src; + + if (a->opcode == BRW_OPCODE_MAD) { + return xs[0].equals(ys[0]) && + ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) || + (xs[2].equals(ys[1]) && xs[1].equals(ys[2]))); + } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) { + bool xs0_negate = xs[0].negate; + bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f + : xs[1].negate; + bool ys0_negate = ys[0].negate; + bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f + : ys[1].negate; + float xs1_imm = xs[1].f; + float ys1_imm = ys[1].f; + + xs[0].negate = false; + xs[1].negate = false; + ys[0].negate = false; + ys[1].negate = false; + xs[1].f = fabsf(xs[1].f); + ys[1].f = fabsf(ys[1].f); + + bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || + (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); + + xs[0].negate = xs0_negate; + xs[1].negate = xs[1].file == IMM ? false : xs1_negate; + ys[0].negate = ys0_negate; + ys[1].negate = ys[1].file == IMM ? 
false : ys1_negate; + xs[1].f = xs1_imm; + ys[1].f = ys1_imm; + + *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate); + if (*negate && (a->saturate || b->saturate)) + return false; + return ret; + } else if (!a->is_commutative()) { + bool match = true; + for (int i = 0; i < a->sources; i++) { + if (!xs[i].equals(ys[i])) { + match = false; + break; + } + } + return match; + } else { + return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || + (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); + } +} + +static bool +instructions_match(fs_inst *a, fs_inst *b, bool *negate) +{ + return a->opcode == b->opcode && + a->force_writemask_all == b->force_writemask_all && + a->exec_size == b->exec_size && + a->group == b->group && + a->saturate == b->saturate && + a->predicate == b->predicate && + a->predicate_inverse == b->predicate_inverse && + a->conditional_mod == b->conditional_mod && + a->flag_subreg == b->flag_subreg && + a->dst.type == b->dst.type && + a->offset == b->offset && + a->mlen == b->mlen && + a->ex_mlen == b->ex_mlen && + a->sfid == b->sfid && + a->desc == b->desc && + a->size_written == b->size_written && + a->base_mrf == b->base_mrf && + a->check_tdr == b->check_tdr && + a->send_has_side_effects == b->send_has_side_effects && + a->eot == b->eot && + a->header_size == b->header_size && + a->shadow_compare == b->shadow_compare && + a->pi_noperspective == b->pi_noperspective && + a->target == b->target && + a->sources == b->sources && + operands_match(a, b, negate); +} + +static void +create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate) +{ + unsigned written = regs_written(inst); + unsigned dst_width = + DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE); + fs_inst *copy; + + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + assert(src.file == VGRF); + fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, + inst->sources); + for (int i = 0; i < inst->header_size; i++) { + payload[i] = src; + 
src.offset += REG_SIZE; + } + for (int i = inst->header_size; i < inst->sources; i++) { + src.type = inst->src[i].type; + payload[i] = src; + src = offset(src, bld, 1); + } + copy = bld.LOAD_PAYLOAD(inst->dst, payload, inst->sources, + inst->header_size); + } else if (written != dst_width) { + assert(src.file == VGRF); + assert(written % dst_width == 0); + const int sources = written / dst_width; + fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources); + for (int i = 0; i < sources; i++) { + payload[i] = src; + src = offset(src, bld, 1); + } + copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, 0); + } else { + copy = bld.MOV(inst->dst, src); + copy->group = inst->group; + copy->force_writemask_all = inst->force_writemask_all; + copy->src[0].negate = negate; + } + assert(regs_written(copy) == written); +} + +bool +fs_visitor::opt_cse_local(const fs_live_variables &live, bblock_t *block, int &ip) +{ + bool progress = false; + exec_list aeb; + + void *cse_ctx = ralloc_context(NULL); + + foreach_inst_in_block(fs_inst, inst, block) { + /* Skip some cases. */ + if (is_expression(this, inst) && !inst->is_partial_write() && + ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) || + inst->dst.is_null())) + { + bool found = false; + bool negate = false; + + foreach_in_list_use_after(aeb_entry, entry, &aeb) { + /* Match current instruction's expression against those in AEB. */ + if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) && + instructions_match(inst, entry->generator, &negate)) { + found = true; + progress = true; + break; + } + } + + if (!found) { + if (inst->opcode != BRW_OPCODE_MOV || + (inst->opcode == BRW_OPCODE_MOV && + inst->src[0].file == IMM && + inst->src[0].type == BRW_REGISTER_TYPE_VF)) { + /* Our first sighting of this expression. Create an entry. 
*/ + aeb_entry *entry = ralloc(cse_ctx, aeb_entry); + entry->tmp = reg_undef; + entry->generator = inst; + aeb.push_tail(entry); + } + } else { + /* This is at least our second sighting of this expression. + * If we don't have a temporary already, make one. + */ + bool no_existing_temp = entry->tmp.file == BAD_FILE; + if (no_existing_temp && !entry->generator->dst.is_null()) { + const fs_builder ibld = fs_builder(this, block, entry->generator) + .at(block, entry->generator->next); + int written = regs_written(entry->generator); + + entry->tmp = fs_reg(VGRF, alloc.allocate(written), + entry->generator->dst.type); + + create_copy_instr(ibld, entry->generator, entry->tmp, false); + + entry->generator->dst = entry->tmp; + } + + /* dest <- temp */ + if (!inst->dst.is_null()) { + assert(inst->size_written == entry->generator->size_written); + assert(inst->dst.type == entry->tmp.type); + const fs_builder ibld(this, block, inst); + + create_copy_instr(ibld, inst, entry->tmp, negate); + } + + /* Set our iterator so that next time through the loop inst->next + * will get the instruction in the basic block after the one we've + * removed. + */ + fs_inst *prev = (fs_inst *)inst->prev; + + inst->remove(block); + inst = prev; + } + } + + /* Discard jumps aren't represented in the CFG unfortunately, so we need + * to make sure that they behave as a CSE barrier, since we lack global + * dataflow information. This is particularly likely to cause problems + * with instructions dependent on the current execution mask like + * SHADER_OPCODE_FIND_LIVE_CHANNEL. + */ + if (inst->opcode == BRW_OPCODE_HALT || + inst->opcode == SHADER_OPCODE_HALT_TARGET) + aeb.make_empty(); + + foreach_in_list_safe(aeb_entry, entry, &aeb) { + /* Kill all AEB entries that write a different value to or read from + * the flag register if we just wrote it. 
+ */ + if (inst->flags_written(devinfo)) { + bool negate; /* dummy */ + if (entry->generator->flags_read(devinfo) || + (entry->generator->flags_written(devinfo) && + !instructions_match(inst, entry->generator, &negate))) { + entry->remove(); + ralloc_free(entry); + continue; + } + } + + for (int i = 0; i < entry->generator->sources; i++) { + fs_reg *src_reg = &entry->generator->src[i]; + + /* Kill all AEB entries that use the destination we just + * overwrote. + */ + if (regions_overlap(inst->dst, inst->size_written, + entry->generator->src[i], + entry->generator->size_read(i))) { + entry->remove(); + ralloc_free(entry); + break; + } + + /* Kill any AEB entries using registers that don't get reused any + * more -- a sure sign they'll fail operands_match(). + */ + if (src_reg->file == VGRF && live.vgrf_end[src_reg->nr] < ip) { + entry->remove(); + ralloc_free(entry); + break; + } + } + } + + ip++; + } + + ralloc_free(cse_ctx); + + return progress; +} + +bool +fs_visitor::opt_cse() +{ + const fs_live_variables &live = live_analysis.require(); + bool progress = false; + int ip = 0; + + foreach_block (block, cfg) { + progress = opt_cse_local(live, block, ip) || progress; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_dead_code_eliminate.cpp b/src/intel/compiler/elk/brw_fs_dead_code_eliminate.cpp new file mode 100644 index 00000000000..51e1bd549cd --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_dead_code_eliminate.cpp @@ -0,0 +1,152 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom 
the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_cfg.h" + +/** @file brw_fs_dead_code_eliminate.cpp + * + * Dataflow-aware dead code elimination. + * + * Walks the instruction list from the bottom, removing instructions that + * have results that both aren't used in later blocks and haven't been read + * yet in the tail end of this block. + */ + +using namespace brw; + +/** + * Is it safe to eliminate the instruction? + */ +static bool +can_eliminate(const intel_device_info *devinfo, const fs_inst *inst, + BITSET_WORD *flag_live) +{ + return !inst->is_control_flow() && + !inst->has_side_effects() && + !(flag_live[0] & inst->flags_written(devinfo)) && + !inst->writes_accumulator; +} + +/** + * Is it safe to omit the write, making the destination ARF null? + */ +static bool +can_omit_write(const fs_inst *inst) +{ + switch (inst->opcode) { + case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: + return true; + default: + /* We can eliminate the destination write for ordinary instructions, + * but not most SENDs. 
+ */ + if (inst->opcode < 128 && inst->mlen == 0) + return true; + + /* It might not be safe for other virtual opcodes. */ + return false; + } +} + +bool +fs_visitor::dead_code_eliminate() +{ + bool progress = false; + + const fs_live_variables &live_vars = live_analysis.require(); + int num_vars = live_vars.num_vars; + BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars)); + BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1); + + foreach_block_reverse_safe(block, cfg) { + memcpy(live, live_vars.block_data[block->num].liveout, + sizeof(BITSET_WORD) * BITSET_WORDS(num_vars)); + memcpy(flag_live, live_vars.block_data[block->num].flag_liveout, + sizeof(BITSET_WORD)); + + foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { + if (inst->dst.file == VGRF) { + const unsigned var = live_vars.var_from_reg(inst->dst); + bool result_live = false; + + for (unsigned i = 0; i < regs_written(inst); i++) + result_live |= BITSET_TEST(live, var + i); + + if (!result_live && + (can_omit_write(inst) || can_eliminate(devinfo, inst, flag_live))) { + inst->dst = fs_reg(spread(retype(brw_null_reg(), inst->dst.type), + inst->dst.stride)); + progress = true; + } + } + + if (inst->dst.is_null() && can_eliminate(devinfo, inst, flag_live)) { + inst->opcode = BRW_OPCODE_NOP; + progress = true; + } + + if (inst->dst.file == VGRF) { + if (!inst->is_partial_write()) { + const unsigned var = live_vars.var_from_reg(inst->dst); + for (unsigned i = 0; i < regs_written(inst); i++) { + BITSET_CLEAR(live, var + i); + } + } + } + + if (!inst->predicate && inst->exec_size >= 8) + flag_live[0] &= ~inst->flags_written(devinfo); + + if (inst->opcode == BRW_OPCODE_NOP) { + inst->remove(block, true); + continue; + } + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + int var = live_vars.var_from_reg(inst->src[i]); + + for (unsigned j = 0; j < regs_read(inst, i); j++) { + BITSET_SET(live, var + j); + } + } + } + + flag_live[0] |= 
inst->flags_read(devinfo); + } + } + + cfg->adjust_block_ips(); + + ralloc_free(live); + ralloc_free(flag_live); + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_generator.cpp b/src/intel/compiler/elk/brw_fs_generator.cpp new file mode 100644 index 00000000000..2525c415ce5 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_generator.cpp @@ -0,0 +1,2544 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_generator.cpp + * + * This file supports generating code from the FS LIR to the actual + * native instructions. 
+ */ + +#include "brw_eu.h" +#include "brw_disasm_info.h" +#include "brw_fs.h" +#include "brw_cfg.h" +#include "dev/intel_debug.h" +#include "util/mesa-sha1.h" +#include "util/half_float.h" + +static enum brw_reg_file +brw_file_from_reg(fs_reg *reg) +{ + switch (reg->file) { + case ARF: + return BRW_ARCHITECTURE_REGISTER_FILE; + case FIXED_GRF: + case VGRF: + return BRW_GENERAL_REGISTER_FILE; + case MRF: + return BRW_MESSAGE_REGISTER_FILE; + case IMM: + return BRW_IMMEDIATE_VALUE; + case BAD_FILE: + case ATTR: + case UNIFORM: + unreachable("not reached"); + } + return BRW_ARCHITECTURE_REGISTER_FILE; +} + +static struct brw_reg +brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst, + fs_reg *reg, bool compressed) +{ + struct brw_reg brw_reg; + + switch (reg->file) { + case MRF: + assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); + FALLTHROUGH; + case VGRF: + if (reg->stride == 0) { + brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0); + } else { + /* From the Haswell PRM: + * + * "VertStride must be used to cross GRF register boundaries. This + * rule implies that elements within a 'Width' cannot cross GRF + * boundaries." + * + * The maximum width value that could satisfy this restriction is: + */ + const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type)); + + /* Because the hardware can only split source regions at a whole + * multiple of width during decompression (i.e. vertically), clamp + * the value obtained above to the physical execution size of a + * single decompressed chunk of the instruction: + */ + const unsigned phys_width = compressed ? 
inst->exec_size / 2 : + inst->exec_size; + + const unsigned max_hw_width = 16; + + /* XXX - The equation above is strictly speaking not correct on + * hardware that supports unbalanced GRF writes -- On Gfx9+ + * each decompressed chunk of the instruction may have a + * different execution size when the number of components + * written to each destination GRF is not the same. + */ + if (reg->stride > 4) { + assert(reg != &inst->dst); + assert(reg->stride * type_sz(reg->type) <= REG_SIZE); + brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, reg->stride, 1, 0); + } else { + const unsigned width = MIN3(reg_width, phys_width, max_hw_width); + brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); + } + + if (devinfo->verx10 == 70) { + /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13): + * "Each DF (Double Float) operand uses an element size of 4 rather + * than 8 and all regioning parameters are twice what the values + * would be based on the true element size: ExecSize, Width, + * HorzStride, and VertStride. Each DF operand uses a pair of + * channels and all masking and swizzing should be adjusted + * appropriately." + * + * From the IvyBridge PRM (Special Requirements for Handling Double + * Precision Data Types, page 71): + * "In Align1 mode, all regioning parameters like stride, execution + * size, and width must use the syntax of a pair of packed + * floats. The offsets for these data types must be 64-bit + * aligned. The execution size and regioning parameters are in terms + * of floats." + * + * Summarized: when handling DF-typed arguments, ExecSize, + * VertStride, and Width must be doubled. + * + * It applies to BayTrail too. 
+ */ + if (type_sz(reg->type) == 8) { + brw_reg.width++; + if (brw_reg.vstride > 0) + brw_reg.vstride++; + assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1); + } + + /* When converting from DF->F, we set the destination stride to 2 + * because each d2f conversion implicitly writes 2 floats, being + * the first one the converted value. IVB/BYT actually writes two + * F components per SIMD channel, and every other component is + * filled with garbage. + */ + if (reg == &inst->dst && get_exec_type_size(inst) == 8 && + type_sz(inst->dst.type) < 8) { + assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1); + brw_reg.hstride--; + } + } + } + + brw_reg = retype(brw_reg, reg->type); + brw_reg = byte_offset(brw_reg, reg->offset); + brw_reg.abs = reg->abs; + brw_reg.negate = reg->negate; + break; + case ARF: + case FIXED_GRF: + case IMM: + assert(reg->offset == 0); + brw_reg = reg->as_brw_reg(); + break; + case BAD_FILE: + /* Probably unused. */ + brw_reg = brw_null_reg(); + break; + case ATTR: + case UNIFORM: + unreachable("not reached"); + } + + /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0> + * region, but on IVB and BYT DF regions must be programmed in terms of + * floats. A <0,2,1> region accomplishes this. 
+ */ + if (devinfo->verx10 == 70 && + type_sz(reg->type) == 8 && + brw_reg.vstride == BRW_VERTICAL_STRIDE_0 && + brw_reg.width == BRW_WIDTH_1 && + brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) { + brw_reg.width = BRW_WIDTH_2; + brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1; + } + + return brw_reg; +} + +fs_generator::fs_generator(const struct brw_compiler *compiler, + const struct brw_compile_params *params, + struct brw_stage_prog_data *prog_data, + bool runtime_check_aads_emit, + gl_shader_stage stage) + + : compiler(compiler), params(params), + devinfo(compiler->devinfo), + prog_data(prog_data), dispatch_width(0), + runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), + shader_name(NULL), stage(stage), mem_ctx(params->mem_ctx) +{ + p = rzalloc(mem_ctx, struct brw_codegen); + brw_init_codegen(&compiler->isa, p, mem_ctx); + + /* In the FS code generator, we are very careful to ensure that we always + * set the right execution size so we don't need the EU code to "help" us + * by trying to infer it. Sometimes, it infers the wrong thing. + */ + p->automatic_exec_sizes = false; +} + +fs_generator::~fs_generator() +{ +} + +class ip_record : public exec_node { +public: + DECLARE_RALLOC_CXX_OPERATORS(ip_record) + + ip_record(int ip) + { + this->ip = ip; + } + + int ip; +}; + +bool +fs_generator::patch_halt_jumps() +{ + if (this->discard_halt_patches.is_empty()) + return false; + + int scale = brw_jump_scale(p->devinfo); + + if (devinfo->ver >= 6) { + /* There is a somewhat strange undocumented requirement of using + * HALT, according to the simulator. If some channel has HALTed to + * a particular UIP, then by the end of the program, every channel + * must have HALTed to that UIP. Furthermore, the tracking is a + * stack, so you can't do the final halt of a UIP after starting + * halting to a new UIP. + * + * Symptoms of not emitting this instruction on actual hardware + * included GPU hangs and sparkly rendering on the piglit discard + * tests. 
+ */ + brw_inst *last_halt = brw_HALT(p); + brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); + brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); + } + + int ip = p->nr_insn; + + foreach_in_list(ip_record, patch_ip, &discard_halt_patches) { + brw_inst *patch = &p->store[patch_ip->ip]; + + assert(brw_inst_opcode(p->isa, patch) == BRW_OPCODE_HALT); + if (devinfo->ver >= 6) { + /* HALT takes a half-instruction distance from the pre-incremented IP. */ + brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); + } else { + brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale)); + } + } + + this->discard_halt_patches.make_empty(); + + if (devinfo->ver < 6) { + /* From the g965 PRM: + * + * "As DMask is not automatically reloaded into AMask upon completion + * of this instruction, software has to manually restore AMask upon + * completion." + * + * DMask lives in the bottom 16 bits of sr0.1. + */ + brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK), + retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW)); + brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1); + brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE); + brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE); + brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH); + } + + if (devinfo->ver == 4 && devinfo->platform != INTEL_PLATFORM_G4X) { + /* From the g965 PRM: + * + * "[DevBW, DevCL] Erratum: The subfields in mask stack register are + * reset to zero during graphics reset, however, they are not + * initialized at thread dispatch. These subfields will retain the + * values from the previous thread. Software should make sure the + * mask stack is empty (reset to zero) before terminating the thread. + * In case that this is not practical, software may have to reset the + * mask stack at the beginning of each kernel, which will impact the + * performance." 
+ * + * Luckily we can rely on: + * + * "[DevBW, DevCL] This register access restriction is not + * applicable, hardware does ensure execution pipeline coherency, + * when a mask stack register is used as an explicit source and/or + * destination." + */ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + + brw_set_default_exec_size(p, BRW_EXECUTE_2); + brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0)); + + brw_set_default_exec_size(p, BRW_EXECUTE_16); + /* Reset the if stack. */ + brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0)); + + brw_pop_insn_state(p); + } + + return true; +} + +void +fs_generator::generate_send(fs_inst *inst, + struct brw_reg dst, + struct brw_reg desc, + struct brw_reg ex_desc, + struct brw_reg payload, + struct brw_reg payload2) +{ + const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE && + dst.nr == BRW_ARF_NULL; + const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE; + + uint32_t desc_imm = inst->desc | + brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size); + + uint32_t ex_desc_imm = inst->ex_desc | + brw_message_ex_desc(devinfo, inst->ex_mlen); + + if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm || + inst->send_ex_desc_scratch) { + /* If we have any sort of extended descriptor, then we need SENDS. This + * also covers the dual-payload case because ex_mlen goes in ex_desc. + */ + brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2, + desc, desc_imm, ex_desc, ex_desc_imm, + inst->send_ex_desc_scratch, + inst->send_ex_bso, inst->eot); + if (inst->check_tdr) + brw_inst_set_opcode(p->isa, brw_last_inst, + devinfo->ver >= 12 ? 
BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC); + } else { + brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm, + inst->eot); + if (inst->check_tdr) + brw_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC); + } +} + +void +fs_generator::fire_fb_write(fs_inst *inst, + struct brw_reg payload, + struct brw_reg implied_header, + GLuint nr) +{ + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + if (devinfo->ver < 6) { + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1), + offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1)); + brw_pop_insn_state(p); + } + + uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data); + + /* We assume render targets start at 0, because headerless FB write + * messages set "Render Target Index" to 0. Using a different binding + * table index would make it impossible to use headerless messages. + */ + const uint32_t surf_index = inst->target; + + brw_inst *insn = brw_fb_WRITE(p, + payload, + retype(implied_header, BRW_REGISTER_TYPE_UW), + msg_control, + surf_index, + nr, + 0, + inst->eot, + inst->last_rt, + inst->header_size != 0); + + if (devinfo->ver >= 6) + brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16); +} + +void +fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) +{ + assert(devinfo->ver < 7); + + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + + const struct brw_reg implied_header = + devinfo->ver < 6 ? 
payload : brw_null_reg(); + + if (inst->base_mrf >= 0) + payload = brw_message_reg(inst->base_mrf); + + if (!runtime_check_aads_emit) { + fire_fb_write(inst, payload, implied_header, inst->mlen); + } else { + /* This can only happen in gen < 6 */ + assert(devinfo->ver < 6); + + struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); + + /* Check runtime bit to detect if we have to send AA data or not */ + brw_push_insn_state(p); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_AND(p, + v1_null_ud, + retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD), + brw_imm_ud(1<<26)); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); + + int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store; + brw_pop_insn_state(p); + { + /* Don't send AA data */ + fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1); + } + brw_land_fwd_jump(p, jmp); + fire_fb_write(inst, payload, implied_header, inst->mlen); + } +} + +void +fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst, + struct brw_reg payload) +{ + assert(inst->size_written % REG_SIZE == 0); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + /* We assume that render targets start at binding table index 0. 
 */
   const unsigned surf_index = inst->target;

   gfx9_fb_READ(p, dst, payload, surf_index,
                inst->header_size, inst->size_written / REG_SIZE,
                prog_data->persample_dispatch);
}

/**
 * Generate code for an indirect MOV: read from \p reg offset by the
 * per-channel byte offsets in \p indirect_byte_offset into \p dst.
 *
 * A constant (immediate) offset is folded directly into the register
 * number/subnumber; a variable offset uses VxH indirect addressing
 * through the address register (clobbering a0.0-a0.7).
 */
void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
   assert(!reg.abs && !reg.negate);

   /* Gen12.5 adds the following region restriction:
    *
    *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
    *    and Quad-Word data must not be used."
    *
    * We require the source and destination types to match so stomp to an
    * unsigned integer type.
    */
   assert(reg.type == dst.type);
   reg.type = dst.type = brw_reg_type_from_bit_size(type_sz(reg.type) * 8,
                                                    BRW_REGISTER_TYPE_UD);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      /* Constant offset: fold it into the register address directly; no
       * indirect addressing is needed at all.
       */
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) {
         /* No 64-bit float support: move the value as two 32-bit halves. */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                 subscript(reg, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                 subscript(reg, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, reg);
      }
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size <= 8 || devinfo->ver >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* Whether we can use destination dependency control without running the
       * risk of a hang if an instruction gets shot down.
       */
      const bool use_dep_ctrl = !inst->predicate &&
                                inst->exec_size == dispatch_width;
      brw_inst *insn;

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction. Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);

      /* There are a number of reasons why we don't use the base offset here.
       * One reason is that the field is only 9 bits which means we can only
       * use it to access the first 16 GRFs. Also, from the Haswell PRM
       * section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address. The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Since the indirect may cause us to cross a register boundary, this
       * makes the base offset almost useless. We could try and do something
       * clever where we use an actual base offset if base_offset % 32 == 0 but
       * that would mean we were generating different code depending on the
       * base offset. Instead, for the sake of consistency, we'll just do the
       * add ourselves. This restriction is only listed in the Haswell PRM
       * but empirical testing indicates that it applies on all older
       * generations and is lifted on Broadwell.
       *
       * In the end, while base_offset is nice to look at in the generated
       * code, using it saves us 0 instructions and would require quite a bit
       * of case-by-case work. It's just not worth it.
       *
       * Due to a hardware bug some platforms (particularly Gfx11+) seem to
       * require the address components of all channels to be valid whether or
       * not they're active, which causes issues if we use VxH addressing
       * under non-uniform control-flow. We can easily work around that by
       * initializing the whole address register with a pipelined NoMask MOV
       * instruction.
       */
      if (devinfo->ver >= 7) {
         insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
         brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
         brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_null());
         else
            brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
      }

      insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
      if (devinfo->ver >= 12)
         brw_set_default_swsb(p, tgl_swsb_regdist(1));
      else if (devinfo->ver >= 7)
         brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);

      if (type_sz(reg.type) > 4 &&
          ((devinfo->verx10 == 70) ||
           devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) ||
           !devinfo->has_64bit_float || devinfo->verx10 >= 125)) {
         /* IVB has an issue (which we found empirically) where it reads two
          * address register components per channel for indirectly addressed
          * 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be used."
          *
          * To work around both of these, we do two integer MOVs instead of one
          * 64-bit MOV. Because no double value should ever cross a register
          * boundary, it's safe to use the immediate offset in the indirect
          * here to handle adding 4 bytes to the offset and avoid the extra
          * ADD to the register file.
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                 retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                 retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
      } else {
         struct brw_reg ind_src = brw_VxH_indirect(0, 0);

         brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));

         if (devinfo->ver == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
             !inst->get_next()->is_tail_sentinel() &&
             ((fs_inst *)inst->get_next())->mlen > 0) {
            /* From the Sandybridge PRM:
             *
             *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
             *    instruction that “indexed/indirect” source AND is followed
             *    by a send, the instruction requires a “Switch”. This is to
             *    avoid race condition where send may dispatch before MRF is
             *    updated."
             */
            brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
         }
      }
   }
}

/**
 * Generate code to shuffle the channels of \p src into \p dst according to
 * the per-channel indices in \p idx, using VxH indirect addressing
 * (clobbers a0.0-a0.7).
 */
void
fs_generator::generate_shuffle(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src,
                               struct brw_reg idx)
{
   assert(src.file == BRW_GENERAL_REGISTER_FILE);
   assert(!src.abs && !src.negate);

   /* Ivy bridge has some strange behavior that makes this a real pain to
    * implement for 64-bit values so we just don't bother.
    */
   assert((devinfo->verx10 >= 75 && devinfo->has_64bit_float) ||
          type_sz(src.type) <= 4);

   /* Gen12.5 adds the following region restriction:
    *
    *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
    *    and Quad-Word data must not be used."
    *
    * We require the source and destination types to match so stomp to an
    * unsigned integer type.
    */
   assert(src.type == dst.type);
   src.type = dst.type = brw_reg_type_from_bit_size(type_sz(src.type) * 8,
                                                    BRW_REGISTER_TYPE_UD);

   /* Because we're using the address register, we're limited to 8-wide
    * execution on gfx7. On gfx8, we're limited to 16-wide by the address
    * register file and 8-wide for 64-bit types.
We could try and make this
    * instruction splittable higher up in the compiler but that gets weird
    * because it reads all of the channels regardless of execution size. It's
    * easier just to split it here.
    */
   const unsigned lower_width =
      devinfo->ver <= 7 || element_sz(src) > 4 || element_sz(dst) > 4 ? 8 :
      MIN2(16, inst->exec_size);

   brw_set_default_exec_size(p, cvt(lower_width) - 1);
   for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
      brw_set_default_group(p, group);

      if ((src.vstride == 0 && src.hstride == 0) ||
          idx.file == BRW_IMMEDIATE_VALUE) {
         /* Trivial, the source is already uniform or the index is a constant.
          * We will typically not get here if the optimizer is doing its job,
          * but asserting would be mean.
          */
         const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
         struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0);
         struct brw_reg group_dst = suboffset(dst, group << (dst.hstride - 1));
         brw_MOV(p, group_dst, group_src);
      } else {
         /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
         struct brw_reg addr = vec8(brw_address_reg(0));

         struct brw_reg group_idx = suboffset(idx, group);

         if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
            /* Things get grumpy if the register is too wide. */
            group_idx.width--;
            group_idx.vstride--;
         }

         assert(type_sz(group_idx.type) <= 4);
         if (type_sz(group_idx.type) == 4) {
            /* The destination stride of an instruction (in bytes) must be
             * greater than or equal to the size of the rest of the
             * instruction. Since the address register is of type UW, we
             * can't use a D-type instruction. In order to get around this,
             * we retype to UW and use a stride.
             */
            group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
         }

         uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr;

         /* From the Haswell PRM:
          *
          *    "When a sequence of NoDDChk and NoDDClr are used, the last
          *    instruction that completes the scoreboard clear must have a
          *    non-zero execution mask. This means, if any kind of predication
          *    can change the execution mask or channel enable of the last
          *    instruction, the optimization must be avoided. This is to
          *    avoid instructions being shot down the pipeline when no writes
          *    are required."
          *
          * Whenever predication is enabled or the instructions being emitted
          * aren't the full width, it's possible that it will be run with zero
          * channels enabled so we can't use dependency control without
          * running the risk of a hang if an instruction gets shot down.
          */
         const bool use_dep_ctrl = !inst->predicate &&
                                   lower_width == dispatch_width;
         brw_inst *insn;

         /* Due to a hardware bug some platforms (particularly Gfx11+) seem
          * to require the address components of all channels to be valid
          * whether or not they're active, which causes issues if we use VxH
          * addressing under non-uniform control-flow. We can easily work
          * around that by initializing the whole address register with a
          * pipelined NoMask MOV instruction.
          */
         insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset));
         brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
         brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_null());
         else
            brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);

         /* Take into account the component size and horizontal stride.
          */
         assert(src.vstride == src.hstride + src.width);
         insn = brw_SHL(p, addr, group_idx,
                        brw_imm_uw(util_logbase2(type_sz(src.type)) +
                                   src.hstride - 1));
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
         else
            brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);

         /* Add on the register start offset */
         brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset));
         brw_MOV(p, suboffset(dst, group << (dst.hstride - 1)),
                 retype(brw_VxH_indirect(0, 0), src.type));
      }

      brw_set_default_swsb(p, tgl_swsb_null());
   }
}

/**
 * Generate code for a quad (4-channel) swizzle of \p src into \p dst,
 * picking the cheapest strategy: a plain MOV for uniform sources, an
 * Align16-swizzled MOV where supported, a strided MOV for the regular
 * swizzle patterns, or per-component MOVs as a fallback.
 */
void
fs_generator::generate_quad_swizzle(const fs_inst *inst,
                                    struct brw_reg dst, struct brw_reg src,
                                    unsigned swiz)
{
   /* Requires a quad. */
   assert(inst->exec_size >= 4);

   if (src.file == BRW_IMMEDIATE_VALUE ||
       has_scalar_region(src)) {
      /* The value is uniform across all channels */
      brw_MOV(p, dst, src);

   } else if (devinfo->ver < 11 && type_sz(src.type) == 4) {
      /* This only works on 8-wide 32-bit values */
      assert(inst->exec_size == 8);
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.vstride == src.width + 1);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      struct brw_reg swiz_src = stride(src, 4, 4, 1);
      swiz_src.swizzle = swiz;
      brw_MOV(p, dst, swiz_src);

   } else {
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.vstride == src.width + 1);
      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));

      switch (swiz) {
      case BRW_SWIZZLE_XXXX:
      case BRW_SWIZZLE_YYYY:
      case BRW_SWIZZLE_ZZZZ:
      case BRW_SWIZZLE_WWWW:
         /* Replicate one component across the quad. */
         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
         break;

      case BRW_SWIZZLE_XXZZ:
      case BRW_SWIZZLE_YYWW:
         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
         break;

      case BRW_SWIZZLE_XYXY:
      case BRW_SWIZZLE_ZWZW:
         assert(inst->exec_size == 4);
         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
         break;

      default:
         assert(inst->force_writemask_all);
         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);

         for (unsigned c = 0; c < 4;
c++) { + brw_inst *insn = brw_MOV( + p, stride(suboffset(dst, c), + 4 * inst->dst.stride, 1, 4 * inst->dst.stride), + stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); + + if (devinfo->ver < 12) { + brw_inst_set_no_dd_clear(devinfo, insn, c < 3); + brw_inst_set_no_dd_check(devinfo, insn, c > 0); + } + + brw_set_default_swsb(p, tgl_swsb_null()); + } + + break; + } + } +} + +void +fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) +{ + struct brw_inst *insn; + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); + brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); + if (devinfo->ver < 12) + brw_set_src1(p, insn, brw_imm_ud(0u)); + + /* For XeHP and newer send a message to the message gateway to terminate a + * compute shader. For older devices, a message is sent to the thread + * spawner. + */ + if (devinfo->verx10 >= 125) + brw_inst_set_sfid(devinfo, insn, BRW_SFID_MESSAGE_GATEWAY); + else + brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER); + brw_inst_set_mlen(devinfo, insn, 1); + brw_inst_set_rlen(devinfo, insn, 0); + brw_inst_set_eot(devinfo, insn, inst->eot); + brw_inst_set_header_present(devinfo, insn, false); + + brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ + + if (devinfo->ver < 11) { + brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ + + /* Note that even though the thread has a URB resource associated with it, + * we set the "do not dereference URB" bit, because the URB resource is + * managed by the fixed-function unit, so it will free it automatically. 
+ */ + brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ + } + + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); +} + +void +fs_generator::generate_barrier(fs_inst *, struct brw_reg src) +{ + brw_barrier(p, src); + if (devinfo->ver >= 12) { + brw_set_default_swsb(p, tgl_swsb_null()); + brw_SYNC(p, TGL_SYNC_BAR); + } else { + brw_WAIT(p); + } +} + +bool +fs_generator::generate_linterp(fs_inst *inst, + struct brw_reg dst, struct brw_reg *src) +{ + /* PLN reads: + * / in SIMD16 \ + * ----------------------------------- + * | src1+0 | src1+1 | src1+2 | src1+3 | + * |-----------------------------------| + * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)| + * ----------------------------------- + * + * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys: + * + * ----------------------------------- + * | src1+0 | src1+1 | src1+2 | src1+3 | + * |-----------------------------------| + * |(x0, x1)|(y0, y1)| | | in SIMD8 + * |-----------------------------------| + * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16 + * ----------------------------------- + * + * See also: emit_interpolation_setup_gfx4(). + */ + struct brw_reg delta_x = src[0]; + struct brw_reg delta_y = offset(src[0], inst->exec_size / 8); + struct brw_reg interp = src[1]; + brw_inst *i[2]; + + /* nir_lower_interpolation() will do the lowering to MAD instructions for + * us on gfx11+ + */ + assert(devinfo->ver < 11); + + if (devinfo->has_pln) { + if (devinfo->ver <= 6 && (delta_x.nr & 1) != 0) { + /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane": + * + * "[DevSNB]: must be even register aligned. + * + * This restriction is lifted on Ivy Bridge. + * + * This means that we need to split PLN into LINE+MAC on-the-fly. + * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so + * we have to split into SIMD8 pieces. For gfx4 (!has_pln), the + * coordinate registers are laid out differently so we leave it as a + * SIMD16 instruction. 
+ */ + assert(inst->exec_size == 8 || inst->exec_size == 16); + assert(inst->group % 16 == 0); + + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_8); + + /* Thanks to two accumulators, we can emit all the LINEs and then all + * the MACs. This improves parallelism a bit. + */ + for (unsigned g = 0; g < inst->exec_size / 8; g++) { + brw_inst *line = brw_LINE(p, brw_null_reg(), interp, + offset(delta_x, g * 2)); + brw_inst_set_group(devinfo, line, inst->group + g * 8); + + /* LINE writes the accumulator automatically on gfx4-5. On Sandy + * Bridge and later, we have to explicitly enable it. + */ + if (devinfo->ver >= 6) + brw_inst_set_acc_wr_control(p->devinfo, line, true); + + /* brw_set_default_saturate() is called before emitting + * instructions, so the saturate bit is set in each instruction, + * so we need to unset it on the LINE instructions. + */ + brw_inst_set_saturate(p->devinfo, line, false); + } + + for (unsigned g = 0; g < inst->exec_size / 8; g++) { + brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1), + offset(delta_x, g * 2 + 1)); + brw_inst_set_group(devinfo, mac, inst->group + g * 8); + brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod); + } + + brw_pop_insn_state(p); + + return true; + } else { + brw_PLN(p, dst, interp, delta_x); + + return false; + } + } else { + i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x); + i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y); + + brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod); + + /* brw_set_default_saturate() is called before emitting instructions, so + * the saturate bit is set in each instruction, so we need to unset it on + * the first instruction. 
       */
      brw_inst_set_saturate(p->devinfo, i[0], false);

      return true;
   }
}

/* Emit a gfx4-6 sampler message for \p inst: choose the message type and
 * SIMD mode from the opcode, shadow-compare state and execution size, set
 * up the message header (implied g0 move or explicit MRF copy plus texel
 * offset), and emit the final SAMPLE send.
 */
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(devinfo->ver < 7);
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;

   /* Sampler EOT message of less than the dispatch width would kill the
    * thread prematurely.
    */
   assert(!inst->eot || inst->exec_size == dispatch_width);

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gfx4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gfx4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->ver >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         assert(!inst->shadow_compare);
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GFX5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         assert(devinfo->ver == 6);
         assert(!inst->shadow_compare);
         msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determines shadow compare and dispatch width
          * from message length for most messages.
          */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            /* Overrides the exec-size based choice made above. */
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   struct brw_reg src = brw_null_reg();
   if (inst->header_size != 0) {
      if (devinfo->ver < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         const tgl_swsb swsb = brw_get_default_swsb(p);
         assert(inst->base_mrf != -1);
         struct brw_reg header_reg = brw_message_reg(inst->base_mrf);

         brw_push_insn_state(p);
         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                    brw_imm_ud(inst->offset));
         }

         brw_pop_insn_state(p);
         brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      }
   }

   assert(surface_index.file == BRW_IMMEDIATE_VALUE);
   assert(sampler_index.file == BRW_IMMEDIATE_VALUE);

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surface_index.ud,
              sampler_index.ud % 16,
              msg_type,
              inst->size_written / REG_SIZE,
              inst->mlen,
              inst->header_size != 0,
              simd_mode,
              return_format);
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *           DDX                    DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
+ * (ss1.br - ss1.bl) (ss1.tr - ss1.br) + * + * and add another set of two more subspans if in 16-pixel dispatch mode. + * + * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result + * for each pair, and vertstride = 2 jumps us 2 elements after processing a + * pair. But the ideal approximation may impose a huge performance cost on + * sample_d. On at least Haswell, sample_d instruction does some + * optimizations if the same LOD is used for all pixels in the subspan. + * + * For DDY, we need to use ALIGN16 mode since it's capable of doing the + * appropriate swizzling. + */ +void +fs_generator::generate_ddx(const fs_inst *inst, + struct brw_reg dst, struct brw_reg src) +{ + unsigned vstride, width; + + if (devinfo->ver >= 8) { + if (inst->opcode == FS_OPCODE_DDX_FINE) { + /* produce accurate derivatives */ + vstride = BRW_VERTICAL_STRIDE_2; + width = BRW_WIDTH_2; + } else { + /* replicate the derivative at the top-left pixel to other pixels */ + vstride = BRW_VERTICAL_STRIDE_4; + width = BRW_WIDTH_4; + } + + struct brw_reg src0 = byte_offset(src, type_sz(src.type));; + struct brw_reg src1 = src; + + src0.vstride = vstride; + src0.width = width; + src0.hstride = BRW_HORIZONTAL_STRIDE_0; + src1.vstride = vstride; + src1.width = width; + src1.hstride = BRW_HORIZONTAL_STRIDE_0; + + brw_ADD(p, dst, src0, negate(src1)); + } else { + /* On Haswell and earlier, the region used above appears to not work + * correctly for compressed instructions. At least on Haswell and + * Iron Lake, compressed ALIGN16 instructions do work. Since we + * would have to split to SIMD8 no matter which method we choose, we + * may as well use ALIGN16 on all platforms gfx7 and earlier. 
+ */ + struct brw_reg src0 = stride(src, 4, 4, 1); + struct brw_reg src1 = stride(src, 4, 4, 1); + if (inst->opcode == FS_OPCODE_DDX_FINE) { + src0.swizzle = BRW_SWIZZLE_XXZZ; + src1.swizzle = BRW_SWIZZLE_YYWW; + } else { + src0.swizzle = BRW_SWIZZLE_XXXX; + src1.swizzle = BRW_SWIZZLE_YYYY; + } + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, dst, negate(src0), src1); + brw_pop_insn_state(p); + } +} + +/* The negate_value boolean is used to negate the derivative computation for + * FBOs, since they place the origin at the upper left instead of the lower + * left. + */ +void +fs_generator::generate_ddy(const fs_inst *inst, + struct brw_reg dst, struct brw_reg src) +{ + const uint32_t type_size = type_sz(src.type); + + if (inst->opcode == FS_OPCODE_DDY_FINE) { + /* produce accurate derivatives. + * + * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU) + * "Register Region Restrictions", Section "1. Special Restrictions": + * + * "In Align16 mode, the channel selects and channel enables apply to + * a pair of half-floats, because these parameters are defined for + * DWord elements ONLY. This is applicable when both source and + * destination are half-floats." + * + * So for half-float operations we use the Gfx11+ Align1 path. CHV + * inherits its FP16 hardware from SKL, so it is not affected. 
+ */ + if (devinfo->ver >= 11 || + (devinfo->platform == INTEL_PLATFORM_BDW && src.type == BRW_REGISTER_TYPE_HF)) { + src = stride(src, 0, 2, 1); + + brw_push_insn_state(p); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + for (uint32_t g = 0; g < inst->exec_size; g += 4) { + brw_set_default_group(p, inst->group + g); + brw_ADD(p, byte_offset(dst, g * type_size), + negate(byte_offset(src, g * type_size)), + byte_offset(src, (g + 2) * type_size)); + brw_set_default_swsb(p, tgl_swsb_null()); + } + brw_pop_insn_state(p); + } else { + struct brw_reg src0 = stride(src, 4, 4, 1); + struct brw_reg src1 = stride(src, 4, 4, 1); + src0.swizzle = BRW_SWIZZLE_XYXY; + src1.swizzle = BRW_SWIZZLE_ZWZW; + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, dst, negate(src0), src1); + brw_pop_insn_state(p); + } + } else { + /* replicate the derivative at the top-left pixel to other pixels */ + if (devinfo->ver >= 8) { + struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); + struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); + + brw_ADD(p, dst, negate(src0), src1); + } else { + /* On Haswell and earlier, the region used above appears to not work + * correctly for compressed instructions. At least on Haswell and + * Iron Lake, compressed ALIGN16 instructions do work. Since we + * would have to split to SIMD8 no matter which method we choose, we + * may as well use ALIGN16 on all platforms gfx7 and earlier. 
+ */ + struct brw_reg src0 = stride(src, 4, 4, 1); + struct brw_reg src1 = stride(src, 4, 4, 1); + src0.swizzle = BRW_SWIZZLE_XXXX; + src1.swizzle = BRW_SWIZZLE_ZZZZ; + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, dst, negate(src0), src1); + brw_pop_insn_state(p); + } + } +} + +void +fs_generator::generate_halt(fs_inst *) +{ + /* This HALT will be patched up at FB write time to point UIP at the end of + * the program, and at brw_uip_jip() JIP will be set to the end of the + * current block (or the program). + */ + this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); + brw_HALT(p); +} + +void +fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) +{ + /* The 32-wide messages only respect the first 16-wide half of the channel + * enable signals which are replicated identically for the second group of + * 16 channels, so we cannot use them unless the write is marked + * force_writemask_all. + */ + const unsigned lower_size = inst->force_writemask_all ? 
inst->exec_size : + MIN2(16, inst->exec_size); + const unsigned block_size = 4 * lower_size / REG_SIZE; + const tgl_swsb swsb = brw_get_default_swsb(p); + assert(inst->mlen != 0); + + brw_push_insn_state(p); + brw_set_default_exec_size(p, cvt(lower_size) - 1); + brw_set_default_compression(p, lower_size > 8); + + for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { + brw_set_default_group(p, inst->group + lower_size * i); + + if (i > 0) { + assert(swsb.mode & TGL_SBID_SET); + brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid)); + } else { + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + } + + brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0), + retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD)); + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), + block_size, + inst->offset + block_size * REG_SIZE * i); + } + + brw_pop_insn_state(p); +} + +void +fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst) +{ + assert(inst->exec_size <= 16 || inst->force_writemask_all); + assert(inst->mlen != 0); + + brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), + inst->exec_size / 8, inst->offset); +} + +void +fs_generator::generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst) +{ + assert(inst->exec_size <= 16 || inst->force_writemask_all); + + gfx7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset); +} + +/* The A32 messages take a buffer base address in header.5:[31:0] (See + * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered + * and OWord block messages in the SKL PRM Vol. 2d for more details.) + * Unfortunately, there are a number of subtle differences: + * + * For the block read/write messages: + * + * - We always stomp header.2 to fill in the actual scratch address (in + * units of OWORDs) so we don't care what's in there. 
 *
 *   - They rely on the per-thread scratch space value in header.3[3:0] to do
 *     bounds checking, so that needs to be valid.  The upper bits of
 *     header.3 are ignored, though, so we can copy all of g0.3.
 *
 *   - They ignore header.5[9:0] and assume the address is 1KB aligned.
 *
 *
 * For the byte/dword scattered read/write messages:
 *
 *   - We want header.2 to be zero because that gets added to the per-channel
 *     offset in the non-header portion of the message.
 *
 *   - Contrary to what the docs claim, they don't do any bounds checking, so
 *     the value of header.3[3:0] doesn't matter.
 *
 *   - They consider all of header.5 for the base address, and header.5[9:0]
 *     are not ignored.  This means that we can't copy g0.5 verbatim because
 *     g0.5[9:0] contains the FFTID on most platforms.  Instead, we have to
 *     use an AND to mask off the bottom 10 bits.
 *
 *
 * For block messages, just copying g0 gives a valid header because all the
 * garbage gets ignored except for header.2, which we stomp as part of message
 * setup.  For byte/dword scattered messages, we can just zero out the header
 * and copy over the bits we need from g0.5.  This opcode, however, tries to
 * satisfy the requirements of both by starting with 0 and filling out the
 * information required by either set of opcodes.
 */
/* Build a message header in \p dst that satisfies both the block and the
 * byte/dword scattered A32 message requirements (see the comment above):
 * start from zero, then fill in the scratch size from g0.3[3:0] and the
 * 1KB-aligned scratch base from g0.5[31:10].
 */
void
fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size == 8 && inst->force_writemask_all);
   assert(dst.file == BRW_GENERAL_REGISTER_FILE);

   dst.type = BRW_REGISTER_TYPE_UD;

   brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
   if (devinfo->ver >= 12)
      brw_set_default_swsb(p, tgl_swsb_null());
   else
      brw_inst_set_no_dd_clear(p->devinfo, insn, true);

   /* Copy the per-thread scratch space size from g0.3[3:0] */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   insn = brw_AND(p, suboffset(dst, 3),
                     retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                     brw_imm_ud(INTEL_MASK(3, 0)));
   if (devinfo->ver < 12) {
      brw_inst_set_no_dd_clear(p->devinfo, insn, true);
      brw_inst_set_no_dd_check(p->devinfo, insn, true);
   }

   /* Copy the scratch base address from g0.5[31:10] */
   insn = brw_AND(p, suboffset(dst, 5),
                     retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                     brw_imm_ud(INTEL_MASK(31, 10)));
   if (devinfo->ver < 12)
      brw_inst_set_no_dd_check(p->devinfo, insn, true);
}

/* Emit an OWord block read of uniform (push-constant style) data from the
 * surface \p index at immediate \p offset into \p dst.
 */
void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(type_sz(dst.type) == 4);
   assert(inst->mlen != 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);
}

/* Emit a pre-gfx7 varying-offset pull-constant load, implemented as a
 * sampler LD message against the constant surface \p index.
 */
void
fs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index)
{
   assert(devinfo->ver < 7); /* Should use the gfx7 variant. */
   assert(inst->header_size != 0);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (inst->exec_size == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      assert(inst->exec_size == 8);
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (devinfo->ver >= 5)
      msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->size_written == 8 * REG_SIZE);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   struct brw_reg header = brw_vec8_grf(0, 0);
   gfx6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_compression(devinfo, send, false);
   brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_desc(p, send,
                brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) |
                brw_sampler_desc(devinfo, surf_index,
                                 0, /* sampler (unused) */
                                 msg_type, simd_mode, return_format));
}

/* Sets vstride=1, width=4, hstride=0 of register src1 during
 * the ADD instruction.
 */
void
fs_generator::generate_set_sample_id(fs_inst *inst,
                                     struct brw_reg dst,
                                     struct brw_reg src0,
                                     struct brw_reg src1)
{
   assert(dst.type == BRW_REGISTER_TYPE_D ||
          dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_D ||
          src0.type == BRW_REGISTER_TYPE_UD);

   /* Replicate each element of src1 across four channels (v1w4h0). */
   const struct brw_reg reg = stride(src1, 1, 4, 0);
   /* Split into at most 16-wide (8-wide before gfx8) instruction groups. */
   const unsigned lower_size = MIN2(inst->exec_size,
                                    devinfo->ver >= 8 ? 16 : 8);

   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
      /* The src0 offset term below rebuilds the register offset implied by
       * src0's region (vstride/width) for the i'th instruction group.
       */
      brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8),
                               offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) *
                                             (i * lower_size / (1 << src0.width))) *
                                            type_sz(src0.type) / REG_SIZE),
                               suboffset(reg, i * lower_size / 4));
      brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
      brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
      brw_inst_set_compression(devinfo, insn, lower_size > 8);
      brw_set_default_swsb(p, tgl_swsb_null());
   }
}

/* Record the shader name and enable debug output for this generator
 * (debug_flag gates the disassembly annotation in generate_code()).
 */
void
fs_generator::enable_debug(const char *shader_name)
{
   debug_flag = true;
   this->shader_name = shader_name;
}

/* Map a systolic depth of 2/4/8/16 to its DPAS instruction encoding. */
static gfx12_systolic_depth
translate_systolic_depth(unsigned d)
{
   /* Could also return (ffs(d) - 1) & 3. */
   switch (d) {
   case 2:  return BRW_SYSTOLIC_DEPTH_2;
   case 4:  return BRW_SYSTOLIC_DEPTH_4;
   case 8:  return BRW_SYSTOLIC_DEPTH_8;
   case 16: return BRW_SYSTOLIC_DEPTH_16;
   default: unreachable("Invalid systolic depth.");
   }
}

int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
                            struct shader_stats shader_stats,
                            const brw::performance &perf,
                            struct brw_compile_stats *stats,
                            unsigned max_polygons)
{
   /* align to 64 byte boundary.
*/ + brw_realign(p, 64); + + this->dispatch_width = dispatch_width; + + int start_offset = p->next_insn_offset; + + int loop_count = 0, send_count = 0, nop_count = 0, sync_nop_count = 0; + bool is_accum_used = false; + + struct disasm_info *disasm_info = disasm_initialize(p->isa, cfg); + + foreach_block_and_inst (block, fs_inst, inst, cfg) { + if (inst->opcode == SHADER_OPCODE_UNDEF) + continue; + + struct brw_reg src[4], dst; + unsigned int last_insn_offset = p->next_insn_offset; + bool multiple_instructions_emitted = false; + tgl_swsb swsb = inst->sched; + + /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the + * "Register Region Restrictions" section: for BDW, SKL: + * + * "A POW/FDIV operation must not be followed by an instruction + * that requires two destination registers." + * + * The documentation is often lacking annotations for Atom parts, + * and empirically this affects CHV as well. + */ + if (devinfo->ver >= 8 && + devinfo->ver <= 9 && + p->nr_insn > 1 && + brw_inst_opcode(p->isa, brw_last_inst) == BRW_OPCODE_MATH && + brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW && + inst->dst.component_size(inst->exec_size) > REG_SIZE) { + brw_NOP(p); + last_insn_offset = p->next_insn_offset; + + /* In order to avoid spurious instruction count differences when the + * instruction schedule changes, keep track of the number of inserted + * NOPs. + */ + nop_count++; + } + + /* Wa_14010017096: + * + * Clear accumulator register before end of thread. 
+ */ + if (inst->eot && is_accum_used && + intel_needs_workaround(devinfo, 14010017096)) { + brw_set_default_exec_size(p, BRW_EXECUTE_16); + brw_set_default_group(p, 0); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f)); + last_insn_offset = p->next_insn_offset; + swsb = tgl_swsb_dst_dep(swsb, 1); + } + + if (!is_accum_used && !inst->eot) { + is_accum_used = inst->writes_accumulator_implicitly(devinfo) || + inst->dst.is_accumulator(); + } + + /* Wa_14013672992: + * + * Always use @1 SWSB for EOT. + */ + if (inst->eot && intel_needs_workaround(devinfo, 14013672992)) { + if (tgl_swsb_src_dep(swsb).mode) { + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + brw_SYNC(p, TGL_SYNC_NOP); + last_insn_offset = p->next_insn_offset; + } + + swsb = tgl_swsb_dst_dep(swsb, 1); + } + + if (unlikely(debug_flag)) + disasm_annotate(disasm_info, inst, p->next_insn_offset); + + /* If the instruction writes to more than one register, it needs to be + * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the + * hardware figures out by itself what the right compression mode is, + * but we still need to know whether the instruction is compressed to + * set up the source register regions appropriately. + * + * XXX - This is wrong for instructions that write a single register but + * read more than one which should strictly speaking be treated as + * compressed. For instructions that don't write any registers it + * relies on the destination being a null register of the correct + * type and regioning so the instruction is considered compressed + * or not accordingly. 
+ */ + const bool compressed = + inst->dst.component_size(inst->exec_size) > REG_SIZE; + brw_set_default_compression(p, compressed); + + if ((devinfo->ver >= 20 || devinfo->ver < 7) && inst->group % 8 != 0) { + assert(inst->force_writemask_all); + assert(!inst->predicate && !inst->conditional_mod); + assert(!inst->writes_accumulator_implicitly(devinfo) && + !inst->reads_accumulator_implicitly()); + assert(inst->opcode != SHADER_OPCODE_SEL_EXEC); + brw_set_default_group(p, 0); + } else { + brw_set_default_group(p, inst->group); + } + + for (unsigned int i = 0; i < inst->sources; i++) { + src[i] = brw_reg_from_fs_reg(devinfo, inst, + &inst->src[i], compressed); + /* The accumulator result appears to get used for the + * conditional modifier generation. When negating a UD + * value, there is a 33rd bit generated for the sign in the + * accumulator value, so now you can't check, for example, + * equality with a 32-bit value. See piglit fs-op-neg-uvec4. + */ + assert(!inst->conditional_mod || + inst->src[i].type != BRW_REGISTER_TYPE_UD || + !inst->src[i].negate); + } + dst = brw_reg_from_fs_reg(devinfo, inst, + &inst->dst, compressed); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_predicate_control(p, inst->predicate); + brw_set_default_predicate_inverse(p, inst->predicate_inverse); + /* On gfx7 and above, hardware automatically adds the group onto the + * flag subregister number. On Sandy Bridge and older, we have to do it + * ourselves. + */ + const unsigned flag_subreg = inst->flag_subreg + + (devinfo->ver >= 7 ? 
0 : inst->group / 16); + brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2); + brw_set_default_saturate(p, inst->saturate); + brw_set_default_mask_control(p, inst->force_writemask_all); + if (devinfo->ver >= 20 && inst->writes_accumulator) { + assert(inst->dst.is_accumulator() || + inst->opcode == BRW_OPCODE_ADDC || + inst->opcode == BRW_OPCODE_MACH || + inst->opcode == BRW_OPCODE_SUBB); + } else { + brw_set_default_acc_write_control(p, inst->writes_accumulator); + } + brw_set_default_swsb(p, swsb); + + unsigned exec_size = inst->exec_size; + if (devinfo->verx10 == 70 && + (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) { + exec_size *= 2; + } + + brw_set_default_exec_size(p, cvt(exec_size) - 1); + + assert(inst->force_writemask_all || inst->exec_size >= 4); + assert(inst->force_writemask_all || inst->group % inst->exec_size == 0); + assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->ver)); + assert(inst->mlen <= BRW_MAX_MSG_LENGTH * reg_unit(devinfo)); + + switch (inst->opcode) { + case BRW_OPCODE_SYNC: + assert(src[0].file == BRW_IMMEDIATE_VALUE); + brw_SYNC(p, tgl_sync_function(src[0].ud)); + + if (tgl_sync_function(src[0].ud) == TGL_SYNC_NOP) + ++sync_nop_count; + + break; + case BRW_OPCODE_MOV: + brw_MOV(p, dst, src[0]); + break; + case BRW_OPCODE_ADD: + brw_ADD(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MUL: + brw_MUL(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_AVG: + brw_AVG(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MACH: + brw_MACH(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_DP4A: + assert(devinfo->ver >= 12); + brw_DP4A(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_LINE: + brw_LINE(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_DPAS: + assert(devinfo->verx10 >= 125); + brw_DPAS(p, translate_systolic_depth(inst->sdepth), inst->rcount, + dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_MAD: + assert(devinfo->ver >= 6); + if (devinfo->ver < 10) + 
brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_MAD(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_LRP: + assert(devinfo->ver >= 6 && devinfo->ver <= 10); + if (devinfo->ver < 10) + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_LRP(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_ADD3: + assert(devinfo->verx10 >= 125); + brw_ADD3(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_FRC: + brw_FRC(p, dst, src[0]); + break; + case BRW_OPCODE_RNDD: + brw_RNDD(p, dst, src[0]); + break; + case BRW_OPCODE_RNDE: + brw_RNDE(p, dst, src[0]); + break; + case BRW_OPCODE_RNDZ: + brw_RNDZ(p, dst, src[0]); + break; + + case BRW_OPCODE_AND: + brw_AND(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_OR: + brw_OR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_XOR: + brw_XOR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_NOT: + brw_NOT(p, dst, src[0]); + break; + case BRW_OPCODE_ASR: + brw_ASR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SHR: + brw_SHR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SHL: + brw_SHL(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_ROL: + assert(devinfo->ver >= 11); + assert(src[0].type == dst.type); + brw_ROL(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_ROR: + assert(devinfo->ver >= 11); + assert(src[0].type == dst.type); + brw_ROR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_F32TO16: + brw_F32TO16(p, dst, src[0]); + break; + case BRW_OPCODE_F16TO32: + brw_F16TO32(p, dst, src[0]); + break; + case BRW_OPCODE_CMP: + if (inst->exec_size >= 16 && devinfo->verx10 == 70 && + dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { + /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround + * implemented in the compiler is not sufficient. Overriding the + * type when the destination is the null register is necessary but + * not sufficient by itself. 
+ */ + dst.type = BRW_REGISTER_TYPE_D; + } + brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); + break; + case BRW_OPCODE_CMPN: + if (inst->exec_size >= 16 && devinfo->verx10 == 70 && + dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { + /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround + * implemented in the compiler is not sufficient. Overriding the + * type when the destination is the null register is necessary but + * not sufficient by itself. + */ + dst.type = BRW_REGISTER_TYPE_D; + } + brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]); + break; + case BRW_OPCODE_SEL: + brw_SEL(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_CSEL: + assert(devinfo->ver >= 8); + if (devinfo->ver < 10) + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_CSEL(p, dst, src[0], src[1], src[2]); + break; + case BRW_OPCODE_BFREV: + assert(devinfo->ver >= 7); + brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), + retype(src[0], BRW_REGISTER_TYPE_UD)); + break; + case BRW_OPCODE_FBH: + assert(devinfo->ver >= 7); + brw_FBH(p, retype(dst, src[0].type), src[0]); + break; + case BRW_OPCODE_FBL: + assert(devinfo->ver >= 7); + brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), + retype(src[0], BRW_REGISTER_TYPE_UD)); + break; + case BRW_OPCODE_LZD: + brw_LZD(p, dst, src[0]); + break; + case BRW_OPCODE_CBIT: + assert(devinfo->ver >= 7); + brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), + retype(src[0], BRW_REGISTER_TYPE_UD)); + break; + case BRW_OPCODE_ADDC: + assert(devinfo->ver >= 7); + brw_ADDC(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SUBB: + assert(devinfo->ver >= 7); + brw_SUBB(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MAC: + brw_MAC(p, dst, src[0], src[1]); + break; + + case BRW_OPCODE_BFE: + assert(devinfo->ver >= 7); + if (devinfo->ver < 10) + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_BFE(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_BFI1: + assert(devinfo->ver >= 7); + brw_BFI1(p, dst, src[0], src[1]); + break; + 
case BRW_OPCODE_BFI2: + assert(devinfo->ver >= 7); + if (devinfo->ver < 10) + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_BFI2(p, dst, src[0], src[1], src[2]); + break; + + case BRW_OPCODE_IF: + if (inst->src[0].file != BAD_FILE) { + /* The instruction has an embedded compare (only allowed on gfx6) */ + assert(devinfo->ver == 6); + gfx6_IF(p, inst->conditional_mod, src[0], src[1]); + } else { + brw_IF(p, brw_get_default_exec_size(p)); + } + break; + + case BRW_OPCODE_ELSE: + brw_ELSE(p); + break; + case BRW_OPCODE_ENDIF: + brw_ENDIF(p); + break; + + case BRW_OPCODE_DO: + brw_DO(p, brw_get_default_exec_size(p)); + break; + + case BRW_OPCODE_BREAK: + brw_BREAK(p); + break; + case BRW_OPCODE_CONTINUE: + brw_CONT(p); + break; + + case BRW_OPCODE_WHILE: + brw_WHILE(p); + loop_count++; + break; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); + if (devinfo->ver >= 6) { + assert(inst->mlen == 0); + assert(devinfo->ver >= 7 || inst->exec_size == 8); + gfx6_math(p, dst, brw_math_function(inst->opcode), + src[0], brw_null_reg()); + } else { + assert(inst->mlen >= 1); + assert(devinfo->ver == 5 || devinfo->platform == INTEL_PLATFORM_G4X || inst->exec_size == 8); + gfx4_math(p, dst, + brw_math_function(inst->opcode), + inst->base_mrf, src[0], + BRW_MATH_PRECISION_FULL); + send_count++; + } + break; + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_POW: + assert(devinfo->verx10 < 125); + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); + if (devinfo->ver >= 6) { + assert(inst->mlen == 0); + assert((devinfo->ver >= 7 && inst->opcode == SHADER_OPCODE_POW) || + inst->exec_size == 8); + gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); + } else { + assert(inst->mlen >= 1); + assert(inst->exec_size == 8); + 
gfx4_math(p, dst, brw_math_function(inst->opcode), + inst->base_mrf, src[0], + BRW_MATH_PRECISION_FULL); + send_count++; + } + break; + case FS_OPCODE_LINTERP: + multiple_instructions_emitted = generate_linterp(inst, dst, src); + break; + case FS_OPCODE_PIXEL_X: + assert(src[0].type == BRW_REGISTER_TYPE_UW); + assert(src[1].type == BRW_REGISTER_TYPE_UW); + src[0].subnr = 0 * type_sz(src[0].type); + if (src[1].file == BRW_IMMEDIATE_VALUE) { + assert(src[1].ud == 0); + brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + } else { + /* Coarse pixel case */ + brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); + } + break; + case FS_OPCODE_PIXEL_Y: + assert(src[0].type == BRW_REGISTER_TYPE_UW); + assert(src[1].type == BRW_REGISTER_TYPE_UW); + src[0].subnr = 4 * type_sz(src[0].type); + if (src[1].file == BRW_IMMEDIATE_VALUE) { + assert(src[1].ud == 0); + brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + } else { + /* Coarse pixel case */ + brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); + } + break; + + case SHADER_OPCODE_SEND: + generate_send(inst, dst, src[0], src[1], src[2], + inst->ex_mlen > 0 ? 
src[3] : brw_null_reg()); + send_count++; + break; + + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_SAMPLEINFO: + assert(inst->src[0].file == BAD_FILE); + generate_tex(inst, dst, src[1], src[2]); + send_count++; + break; + + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + generate_ddx(inst, dst, src[0]); + break; + case FS_OPCODE_DDY_COARSE: + case FS_OPCODE_DDY_FINE: + generate_ddy(inst, dst, src[0]); + break; + + case SHADER_OPCODE_GFX4_SCRATCH_WRITE: + generate_scratch_write(inst, src[0]); + send_count++; + break; + + case SHADER_OPCODE_GFX4_SCRATCH_READ: + generate_scratch_read(inst, dst); + send_count++; + break; + + case SHADER_OPCODE_GFX7_SCRATCH_READ: + generate_scratch_read_gfx7(inst, dst); + send_count++; + break; + + case SHADER_OPCODE_SCRATCH_HEADER: + generate_scratch_header(inst, dst); + break; + + case SHADER_OPCODE_MOV_INDIRECT: + generate_mov_indirect(inst, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_MOV_RELOC_IMM: + assert(src[0].file == BRW_IMMEDIATE_VALUE); + brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud); + break; + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + assert(inst->force_writemask_all); + generate_uniform_pull_constant_load(inst, dst, + src[PULL_UNIFORM_CONSTANT_SRC_SURFACE], + src[PULL_UNIFORM_CONSTANT_SRC_OFFSET]); + send_count++; + break; + + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: + generate_varying_pull_constant_load_gfx4(inst, dst, src[0]); + send_count++; + break; + + case FS_OPCODE_REP_FB_WRITE: + case FS_OPCODE_FB_WRITE: + generate_fb_write(inst, src[0]); + send_count++; + break; + + case FS_OPCODE_FB_READ: + generate_fb_read(inst, dst, src[0]); + send_count++; + break; + + case BRW_OPCODE_HALT: + generate_halt(inst); + break; + + case SHADER_OPCODE_INTERLOCK: + case SHADER_OPCODE_MEMORY_FENCE: { 
+ assert(src[1].file == BRW_IMMEDIATE_VALUE); + assert(src[2].file == BRW_IMMEDIATE_VALUE); + + const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ? + BRW_OPCODE_SENDC : BRW_OPCODE_SEND; + + brw_memory_fence(p, dst, src[0], send_op, + brw_message_target(inst->sfid), + inst->desc, + /* commit_enable */ src[1].ud, + /* bti */ src[2].ud); + send_count++; + break; + } + + case FS_OPCODE_SCHEDULING_FENCE: + if (inst->sources == 0 && swsb.regdist == 0 && + swsb.mode == TGL_SBID_NULL) { + if (unlikely(debug_flag)) + disasm_info->use_tail = true; + break; + } + + if (devinfo->ver >= 12) { + /* Use the available SWSB information to stall. A single SYNC is + * sufficient since if there were multiple dependencies, the + * scoreboard algorithm already injected other SYNCs before this + * instruction. + */ + brw_SYNC(p, TGL_SYNC_NOP); + } else { + for (unsigned i = 0; i < inst->sources; i++) { + /* Emit a MOV to force a stall until the instruction producing the + * registers finishes. 
+ */ + brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), + retype(src[i], BRW_REGISTER_TYPE_UW)); + } + + if (inst->sources > 1) + multiple_instructions_emitted = true; + } + + break; + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + brw_find_live_channel(p, dst, false); + break; + case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: + brw_find_live_channel(p, dst, true); + break; + + case FS_OPCODE_LOAD_LIVE_CHANNELS: { + assert(devinfo->ver >= 8); + assert(inst->force_writemask_all && inst->group == 0); + assert(inst->dst.file == BAD_FILE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg), + BRW_REGISTER_TYPE_UD), + retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); + break; + } + case SHADER_OPCODE_BROADCAST: + assert(inst->force_writemask_all); + brw_broadcast(p, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_SHUFFLE: + generate_shuffle(inst, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_SEL_EXEC: + assert(inst->force_writemask_all); + assert(devinfo->has_64bit_float || type_sz(dst.type) <= 4); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, dst, src[1]); + brw_set_default_mask_control(p, BRW_MASK_ENABLE); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, dst, src[0]); + break; + + case SHADER_OPCODE_QUAD_SWIZZLE: + assert(src[1].file == BRW_IMMEDIATE_VALUE); + assert(src[1].type == BRW_REGISTER_TYPE_UD); + generate_quad_swizzle(inst, dst, src[0], src[1].ud); + break; + + case SHADER_OPCODE_CLUSTER_BROADCAST: { + assert((devinfo->platform != INTEL_PLATFORM_CHV && + !intel_device_info_is_9lp(devinfo) && + devinfo->has_64bit_float) || type_sz(src[0].type) <= 4); + assert(!src[0].negate && !src[0].abs); + assert(src[1].file == BRW_IMMEDIATE_VALUE); + assert(src[1].type == BRW_REGISTER_TYPE_UD); + assert(src[2].file == BRW_IMMEDIATE_VALUE); + assert(src[2].type == BRW_REGISTER_TYPE_UD); + const unsigned component = src[1].ud; + const unsigned cluster_size = src[2].ud; + 
assert(inst->src[0].file != ARF && inst->src[0].file != FIXED_GRF); + const unsigned s = inst->src[0].stride; + unsigned vstride = cluster_size * s; + unsigned width = cluster_size; + + /* The maximum exec_size is 32, but the maximum width is only 16. */ + if (inst->exec_size == width) { + vstride = 0; + width = 1; + } + + struct brw_reg strided = stride(suboffset(src[0], component * s), + vstride, width, 0); + brw_MOV(p, dst, strided); + break; + } + + case FS_OPCODE_SET_SAMPLE_ID: + generate_set_sample_id(inst, dst, src[0], src[1]); + break; + + case SHADER_OPCODE_HALT_TARGET: + /* This is the place where the final HALT needs to be inserted if + * we've emitted any discards. If not, this will emit no code. + */ + if (!patch_halt_jumps()) { + if (unlikely(debug_flag)) { + disasm_info->use_tail = true; + } + } + break; + + case CS_OPCODE_CS_TERMINATE: + generate_cs_terminate(inst, src[0]); + send_count++; + break; + + case SHADER_OPCODE_BARRIER: + generate_barrier(inst, src[0]); + send_count++; + break; + + case BRW_OPCODE_DIM: + assert(devinfo->platform == INTEL_PLATFORM_HSW); + assert(src[0].type == BRW_REGISTER_TYPE_DF); + assert(dst.type == BRW_REGISTER_TYPE_DF); + brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); + break; + + case SHADER_OPCODE_RND_MODE: { + assert(src[0].file == BRW_IMMEDIATE_VALUE); + /* + * Changes the floating point rounding mode updating the control + * register field defined at cr0.0[5-6] bits. 
+ */ + enum brw_rnd_mode mode = + (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT); + brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK); + } + break; + + case SHADER_OPCODE_FLOAT_CONTROL_MODE: + assert(src[0].file == BRW_IMMEDIATE_VALUE); + assert(src[1].file == BRW_IMMEDIATE_VALUE); + brw_float_controls_mode(p, src[0].d, src[1].d); + break; + + case SHADER_OPCODE_READ_SR_REG: + if (devinfo->ver >= 12) { + /* There is a SWSB restriction that requires that any time sr0 is + * accessed both the instruction doing the access and the next one + * have SWSB set to RegDist(1). + */ + if (brw_get_default_swsb(p).mode != TGL_SBID_NULL) + brw_SYNC(p, TGL_SYNC_NOP); + assert(src[0].file == BRW_IMMEDIATE_VALUE); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_MOV(p, dst, brw_sr0_reg(src[0].ud)); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_AND(p, dst, dst, brw_imm_ud(0xffffffff)); + } else { + brw_MOV(p, dst, brw_sr0_reg(src[0].ud)); + } + break; + + default: + unreachable("Unsupported opcode"); + + case SHADER_OPCODE_LOAD_PAYLOAD: + unreachable("Should be lowered by lower_load_payload()"); + } + + if (multiple_instructions_emitted) + continue; + + if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { + assert(p->next_insn_offset == last_insn_offset + 16 || + !"conditional_mod, no_dd_check, or no_dd_clear set for IR " + "emitting more than 1 instruction"); + + brw_inst *last = &p->store[last_insn_offset / 16]; + + if (inst->conditional_mod) + brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); + if (devinfo->ver < 12) { + brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); + brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); + } + } + + /* When enabled, insert sync NOP after every instruction and make sure + * that current instruction depends on the previous instruction. 
+ */ + if (INTEL_DEBUG(DEBUG_SWSB_STALL) && devinfo->ver >= 12) { + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + brw_SYNC(p, TGL_SYNC_NOP); + } + } + + brw_set_uip_jip(p, start_offset); + + /* end of program sentinel */ + disasm_new_inst_group(disasm_info, p->next_insn_offset); + + /* `send_count` explicitly does not include spills or fills, as we'd + * like to use it as a metric for intentional memory access or other + * shared function use. Otherwise, subtle changes to scheduling or + * register allocation could cause it to fluctuate wildly - and that + * effect is already counted in spill/fill counts. + */ + send_count -= shader_stats.spill_count; + send_count -= shader_stats.fill_count; + +#ifndef NDEBUG + bool validated = +#else + if (unlikely(debug_flag)) +#endif + brw_validate_instructions(&compiler->isa, p->store, + start_offset, + p->next_insn_offset, + disasm_info); + + int before_size = p->next_insn_offset - start_offset; + brw_compact_instructions(p, start_offset, disasm_info); + int after_size = p->next_insn_offset - start_offset; + + bool dump_shader_bin = brw_should_dump_shader_bin(); + unsigned char sha1[21]; + char sha1buf[41]; + + if (unlikely(debug_flag || dump_shader_bin)) { + _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst), + after_size, sha1); + _mesa_sha1_format(sha1buf, sha1); + } + + if (unlikely(dump_shader_bin)) + brw_dump_shader_bin(p->store, start_offset, p->next_insn_offset, + sha1buf); + + if (unlikely(debug_flag)) { + fprintf(stderr, "Native code for %s (src_hash 0x%08x) (sha1 %s)\n" + "SIMD%d shader: %d instructions. %d loops. %u cycles. " + "%d:%d spills:fills, %u sends, " + "scheduled with mode %s. " + "Promoted %u constants. 
" + "Compacted %d to %d bytes (%.0f%%)\n", + shader_name, params->source_hash, sha1buf, + dispatch_width, before_size / 16, + loop_count, perf.latency, + shader_stats.spill_count, + shader_stats.fill_count, + send_count, + shader_stats.scheduler_mode, + shader_stats.promoted_constants, + before_size, after_size, + 100.0f * (before_size - after_size) / before_size); + + /* overriding the shader makes disasm_info invalid */ + if (!brw_try_override_assembly(p, start_offset, sha1buf)) { + dump_assembly(p->store, start_offset, p->next_insn_offset, + disasm_info, perf.block_latency); + } else { + fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf); + } + } + ralloc_free(disasm_info); +#ifndef NDEBUG + if (!validated && !debug_flag) { + fprintf(stderr, + "Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n"); + } +#endif + assert(validated); + + brw_shader_debug_log(compiler, params->log_data, + "%s SIMD%d shader: %d inst, %d loops, %u cycles, " + "%d:%d spills:fills, %u sends, " + "scheduled with mode %s, " + "Promoted %u constants, " + "compacted %d to %d bytes.\n", + _mesa_shader_stage_to_abbrev(stage), + dispatch_width, + before_size / 16 - nop_count - sync_nop_count, + loop_count, perf.latency, + shader_stats.spill_count, + shader_stats.fill_count, + send_count, + shader_stats.scheduler_mode, + shader_stats.promoted_constants, + before_size, after_size); + if (stats) { + stats->dispatch_width = dispatch_width; + stats->max_polygons = max_polygons; + stats->max_dispatch_width = dispatch_width; + stats->instructions = before_size / 16 - nop_count - sync_nop_count; + stats->sends = send_count; + stats->loops = loop_count; + stats->cycles = perf.latency; + stats->spills = shader_stats.spill_count; + stats->fills = shader_stats.fill_count; + stats->max_live_registers = shader_stats.max_register_pressure; + } + + return start_offset; +} + +void +fs_generator::add_const_data(void *data, unsigned size) +{ + 
assert(prog_data->const_data_size == 0); + if (size > 0) { + prog_data->const_data_size = size; + prog_data->const_data_offset = brw_append_data(p, data, size, 32); + } +} + +void +fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt) +{ + assert(brw_shader_stage_is_bindless(stage)); + struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data); + if (num_resume_shaders > 0) { + bs_prog_data->resume_sbt_offset = + brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32); + for (unsigned i = 0; i < num_resume_shaders; i++) { + size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt); + assert(offset <= UINT32_MAX); + brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET, + BRW_SHADER_RELOC_TYPE_U32, + (uint32_t)offset, (uint32_t)sbt[i]); + } + } +} + +const unsigned * +fs_generator::get_assembly() +{ + prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs); + + return brw_get_program(p, &prog_data->program_size); +} diff --git a/src/intel/compiler/elk/brw_fs_live_variables.cpp b/src/intel/compiler/elk/brw_fs_live_variables.cpp new file mode 100644 index 00000000000..c6361d67d95 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_live_variables.cpp @@ -0,0 +1,371 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" + +using namespace brw; + +#define MAX_INSTRUCTION (1 << 30) + +/** @file brw_fs_live_variables.cpp + * + * Support for calculating liveness information about virtual GRFs. + * + * This produces a live interval for each whole virtual GRF. We could + * choose to expose per-component live intervals for VGRFs of size > 1, + * but we currently do not. It is easier for the consumers of this + * information to work with whole VGRFs. + * + * However, we internally track use/def information at the per-GRF level for + * greater accuracy. Large VGRFs may be accessed piecemeal over many + * (possibly non-adjacent) instructions. In this case, examining a single + * instruction is insufficient to decide whether a whole VGRF is ultimately + * used or defined. Tracking individual components allows us to easily + * assemble this information. + * + * See Muchnick's Advanced Compiler Design and Implementation, section + * 14.1 (p444). + */ + +void +fs_live_variables::setup_one_read(struct block_data *bd, + int ip, const fs_reg ®) +{ + int var = var_from_reg(reg); + assert(var < num_vars); + + start[var] = MIN2(start[var], ip); + end[var] = MAX2(end[var], ip); + + /* The use[] bitset marks when the block makes use of a variable (VGRF + * channel) without having completely defined that variable within the + * block. 
+ */ + if (!BITSET_TEST(bd->def, var)) + BITSET_SET(bd->use, var); +} + +void +fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst, + int ip, const fs_reg ®) +{ + int var = var_from_reg(reg); + assert(var < num_vars); + + start[var] = MIN2(start[var], ip); + end[var] = MAX2(end[var], ip); + + /* The def[] bitset marks when an initialization in a block completely + * screens off previous updates of that variable (VGRF channel). + */ + if (inst->dst.file == VGRF) { + if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var)) + BITSET_SET(bd->def, var); + + BITSET_SET(bd->defout, var); + } +} + +/** + * Sets up the use[] and def[] bitsets. + * + * The basic-block-level live variable analysis needs to know which + * variables get used before they're completely defined, and which + * variables are completely defined before they're used. + * + * These are tracked at the per-component level, rather than whole VGRFs. + */ +void +fs_live_variables::setup_def_use() +{ + int ip = 0; + + foreach_block (block, cfg) { + assert(ip == block->start_ip); + if (block->num > 0) + assert(cfg->blocks[block->num - 1]->end_ip == ip - 1); + + struct block_data *bd = &block_data[block->num]; + + foreach_inst_in_block(fs_inst, inst, block) { + /* Set use[] for this instruction */ + for (unsigned int i = 0; i < inst->sources; i++) { + fs_reg reg = inst->src[i]; + + if (reg.file != VGRF) + continue; + + for (unsigned j = 0; j < regs_read(inst, i); j++) { + setup_one_read(bd, ip, reg); + reg.offset += REG_SIZE; + } + } + + bd->flag_use[0] |= inst->flags_read(devinfo) & ~bd->flag_def[0]; + + /* Set def[] for this instruction */ + if (inst->dst.file == VGRF) { + fs_reg reg = inst->dst; + for (unsigned j = 0; j < regs_written(inst); j++) { + setup_one_write(bd, inst, ip, reg); + reg.offset += REG_SIZE; + } + } + + if (!inst->predicate && inst->exec_size >= 8) + bd->flag_def[0] |= inst->flags_written(devinfo) & ~bd->flag_use[0]; + + ip++; + } + } +} + +/** + * The algorithm 
incrementally sets bits in liveout and livein, + * propagating it through control flow. It will eventually terminate + * because it only ever adds bits, and stops when no bits are added in + * a pass. + */ +void +fs_live_variables::compute_live_variables() +{ + bool cont = true; + + /* Propagate defin and defout down the CFG to calculate the union of live + * variables potentially defined along any possible control flow path. + */ + do { + cont = false; + + foreach_block (block, cfg) { + const struct block_data *bd = &block_data[block->num]; + + foreach_list_typed(bblock_link, child_link, link, &block->children) { + struct block_data *child_bd = &block_data[child_link->block->num]; + + for (int i = 0; i < bitset_words; i++) { + const BITSET_WORD new_def = bd->defout[i] & ~child_bd->defin[i]; + child_bd->defin[i] |= new_def; + child_bd->defout[i] |= new_def; + cont |= new_def; + } + } + } + } while (cont); + + do { + cont = false; + + foreach_block_reverse (block, cfg) { + struct block_data *bd = &block_data[block->num]; + + /* Update liveout */ + foreach_list_typed(bblock_link, child_link, link, &block->children) { + struct block_data *child_bd = &block_data[child_link->block->num]; + + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_liveout = (child_bd->livein[i] & + ~bd->liveout[i]); + new_liveout &= bd->defout[i]; /* Screen off uses with no reaching def */ + if (new_liveout) + bd->liveout[i] |= new_liveout; + } + BITSET_WORD new_liveout = (child_bd->flag_livein[0] & + ~bd->flag_liveout[0]); + if (new_liveout) + bd->flag_liveout[0] |= new_liveout; + } + + /* Update livein */ + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = (bd->use[i] | + (bd->liveout[i] & + ~bd->def[i])); + new_livein &= bd->defin[i]; /* Screen off uses with no reaching def */ + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + cont = true; + } + } + BITSET_WORD new_livein = (bd->flag_use[0] | + (bd->flag_liveout[0] & + ~bd->flag_def[0])); + if 
(new_livein & ~bd->flag_livein[0]) { + bd->flag_livein[0] |= new_livein; + cont = true; + } + } + } while (cont); +} + +/** + * Extend the start/end ranges for each variable to account for the + * new information calculated from control flow. + */ +void +fs_live_variables::compute_start_end() +{ + foreach_block (block, cfg) { + struct block_data *bd = &block_data[block->num]; + unsigned i; + + BITSET_FOREACH_SET(i, bd->livein, (unsigned)num_vars) { + start[i] = MIN2(start[i], block->start_ip); + end[i] = MAX2(end[i], block->start_ip); + } + + BITSET_FOREACH_SET(i, bd->liveout, (unsigned)num_vars) { + start[i] = MIN2(start[i], block->end_ip); + end[i] = MAX2(end[i], block->end_ip); + } + } +} + +fs_live_variables::fs_live_variables(const backend_shader *s) + : devinfo(s->devinfo), cfg(s->cfg) +{ + mem_ctx = ralloc_context(NULL); + linear_ctx *lin_ctx = linear_context(mem_ctx); + + num_vgrfs = s->alloc.count; + num_vars = 0; + var_from_vgrf = linear_zalloc_array(lin_ctx, int, num_vgrfs); + for (int i = 0; i < num_vgrfs; i++) { + var_from_vgrf[i] = num_vars; + num_vars += s->alloc.sizes[i]; + } + + vgrf_from_var = linear_zalloc_array(lin_ctx, int, num_vars); + for (int i = 0; i < num_vgrfs; i++) { + for (unsigned j = 0; j < s->alloc.sizes[i]; j++) { + vgrf_from_var[var_from_vgrf[i] + j] = i; + } + } + + start = ralloc_array(mem_ctx, int, num_vars); + end = linear_zalloc_array(lin_ctx, int, num_vars); + for (int i = 0; i < num_vars; i++) { + start[i] = MAX_INSTRUCTION; + end[i] = -1; + } + + vgrf_start = ralloc_array(mem_ctx, int, num_vgrfs); + vgrf_end = ralloc_array(mem_ctx, int, num_vgrfs); + for (int i = 0; i < num_vgrfs; i++) { + vgrf_start[i] = MAX_INSTRUCTION; + vgrf_end[i] = -1; + } + + block_data = linear_zalloc_array(lin_ctx, struct block_data, cfg->num_blocks); + + bitset_words = BITSET_WORDS(num_vars); + for (int i = 0; i < cfg->num_blocks; i++) { + block_data[i].def = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].use = 
linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].livein = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].liveout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].defin = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + block_data[i].defout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words); + + block_data[i].flag_def[0] = 0; + block_data[i].flag_use[0] = 0; + block_data[i].flag_livein[0] = 0; + block_data[i].flag_liveout[0] = 0; + } + + setup_def_use(); + compute_live_variables(); + compute_start_end(); + + /* Merge the per-component live ranges to whole VGRF live ranges. */ + for (int i = 0; i < num_vars; i++) { + const unsigned vgrf = vgrf_from_var[i]; + vgrf_start[vgrf] = MIN2(vgrf_start[vgrf], start[i]); + vgrf_end[vgrf] = MAX2(vgrf_end[vgrf], end[i]); + } +} + +fs_live_variables::~fs_live_variables() +{ + ralloc_free(mem_ctx); +} + +static bool +check_register_live_range(const fs_live_variables *live, int ip, + const fs_reg ®, unsigned n) +{ + const unsigned var = live->var_from_reg(reg); + + if (var + n > unsigned(live->num_vars) || + live->vgrf_start[reg.nr] > ip || live->vgrf_end[reg.nr] < ip) + return false; + + for (unsigned j = 0; j < n; j++) { + if (live->start[var + j] > ip || live->end[var + j] < ip) + return false; + } + + return true; +} + +bool +fs_live_variables::validate(const backend_shader *s) const +{ + int ip = 0; + + foreach_block_and_inst(block, fs_inst, inst, s->cfg) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF && + !check_register_live_range(this, ip, + inst->src[i], regs_read(inst, i))) + return false; + } + + if (inst->dst.file == VGRF && + !check_register_live_range(this, ip, inst->dst, regs_written(inst))) + return false; + + ip++; + } + + return true; +} + +bool +fs_live_variables::vars_interfere(int a, int b) const +{ + return !(end[b] <= start[a] || + end[a] <= start[b]); +} + +bool 
+fs_live_variables::vgrfs_interfere(int a, int b) const +{ + return !(vgrf_end[a] <= vgrf_start[b] || + vgrf_end[b] <= vgrf_start[a]); +} diff --git a/src/intel/compiler/elk/brw_fs_live_variables.h b/src/intel/compiler/elk/brw_fs_live_variables.h new file mode 100644 index 00000000000..1c77efa0c19 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_live_variables.h @@ -0,0 +1,148 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +#ifndef BRW_FS_LIVE_VARIABLES_H +#define BRW_FS_LIVE_VARIABLES_H + +#include "brw_ir_analysis.h" +#include "brw_ir_fs.h" +#include "util/bitset.h" + +struct cfg_t; +struct backend_shader; + +namespace brw { + +class fs_live_variables { +public: + struct block_data { + /** + * Which variables are defined before being used in the block. 
+ * + * Note that for our purposes, "defined" means unconditionally, completely + * defined. + */ + BITSET_WORD *def; + + /** + * Which variables are used before being defined in the block. + */ + BITSET_WORD *use; + + /** Which defs reach the entry point of the block. */ + BITSET_WORD *livein; + + /** Which defs reach the exit point of the block. */ + BITSET_WORD *liveout; + + /** + * Variables such that the entry point of the block may be reached from any + * of their definitions. + */ + BITSET_WORD *defin; + + /** + * Variables such that the exit point of the block may be reached from any + * of their definitions. + */ + BITSET_WORD *defout; + + BITSET_WORD flag_def[1]; + BITSET_WORD flag_use[1]; + BITSET_WORD flag_livein[1]; + BITSET_WORD flag_liveout[1]; + }; + + fs_live_variables(const backend_shader *s); + ~fs_live_variables(); + + bool validate(const backend_shader *s) const; + + analysis_dependency_class + dependency_class() const + { + return (DEPENDENCY_INSTRUCTION_IDENTITY | + DEPENDENCY_INSTRUCTION_DATA_FLOW | + DEPENDENCY_VARIABLES); + } + + bool vars_interfere(int a, int b) const; + bool vgrfs_interfere(int a, int b) const; + int var_from_reg(const fs_reg ®) const + { + return var_from_vgrf[reg.nr] + reg.offset / REG_SIZE; + } + + /** Map from virtual GRF number to index in block_data arrays. */ + int *var_from_vgrf; + + /** + * Map from any index in block_data to the virtual GRF containing it. + * + * For alloc.sizes of [1, 2, 3], vgrf_from_var would contain + * [0, 1, 1, 2, 2, 2]. + */ + int *vgrf_from_var; + + int num_vars; + int num_vgrfs; + int bitset_words; + + /** @{ + * Final computed live ranges for each var (each component of each virtual + * GRF). + */ + int *start; + int *end; + /** @} */ + + /** @{ + * Final computed live ranges for each VGRF. 
+ */ + int *vgrf_start; + int *vgrf_end; + /** @} */ + + /** Per-basic-block information on live variables */ + struct block_data *block_data; + +protected: + void setup_def_use(); + void setup_one_read(struct block_data *bd, int ip, const fs_reg ®); + void setup_one_write(struct block_data *bd, fs_inst *inst, int ip, + const fs_reg ®); + void compute_live_variables(); + void compute_start_end(); + + const struct intel_device_info *devinfo; + const cfg_t *cfg; + void *mem_ctx; +}; + +} /* namespace brw */ + +#endif /* BRW_FS_LIVE_VARIABLES_H */ diff --git a/src/intel/compiler/elk/brw_fs_lower_dpas.cpp b/src/intel/compiler/elk/brw_fs_lower_dpas.cpp new file mode 100644 index 00000000000..306731722af --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_lower_dpas.cpp @@ -0,0 +1,306 @@ +/* + * Copyright 2023 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" + +using namespace brw; + +static void +f16_using_mac(const fs_builder &bld, fs_inst *inst) +{ + /* We only intend to support configurations where the destination and + * accumulator have the same type. + */ + if (!inst->src[0].is_null()) + assert(inst->dst.type == inst->src[0].type); + + assert(inst->src[1].type == BRW_REGISTER_TYPE_HF); + assert(inst->src[2].type == BRW_REGISTER_TYPE_HF); + + const brw_reg_type src0_type = inst->dst.type; + const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF; + const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF; + + const fs_reg dest = inst->dst; + fs_reg src0 = inst->src[0]; + const fs_reg src1 = retype(inst->src[1], src1_type); + const fs_reg src2 = retype(inst->src[2], src2_type); + + const unsigned dest_stride = + dest.type == BRW_REGISTER_TYPE_HF ? 
REG_SIZE / 2 : REG_SIZE; + + for (unsigned r = 0; r < inst->rcount; r++) { + fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1); + + for (unsigned subword = 0; subword < 2; subword++) { + for (unsigned s = 0; s < inst->sdepth; s++) { + /* The first multiply of the dot-product operation has to + * explicitly write the accumulator register. The successive MAC + * instructions will implicitly read *and* write the + * accumulator. Those MAC instructions can also optionally + * explicitly write some other register. + * + * FINISHME: The accumulator can actually hold 16 HF values. On + * Gfx12 there are two accumulators. It should be possible to do + * this in SIMD16 or even SIMD32. I was unable to get this to work + * properly. + */ + if (s == 0 && subword == 0) { + const unsigned acc_width = 8; + fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD), + inst->group % acc_width); + + if (bld.shader->devinfo->verx10 >= 125) { + acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword); + } else { + acc = retype(acc, BRW_REGISTER_TYPE_HF); + } + + bld.MUL(acc, + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + BRW_REGISTER_TYPE_HF, subword), + component(retype(byte_offset(src2, r * REG_SIZE), + BRW_REGISTER_TYPE_HF), + s * 2 + subword)) + ->writes_accumulator = true; + + } else { + fs_reg result; + + /* As mentioned above, the MAC had an optional, explicit + * destination register. Various optimization passes are not + * clever enough to understand the intricacies of this + * instruction, so only write the result register on the final + * MAC in the sequence. 
+ */ + if ((s + 1) == inst->sdepth && subword == 1) + result = temp; + else + result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF); + + bld.MAC(result, + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + BRW_REGISTER_TYPE_HF, subword), + component(retype(byte_offset(src2, r * REG_SIZE), + BRW_REGISTER_TYPE_HF), + s * 2 + subword)) + ->writes_accumulator = true; + } + } + } + + if (!src0.is_null()) { + if (src0_type != BRW_REGISTER_TYPE_HF) { + fs_reg temp2 = bld.vgrf(src0_type, 1); + + bld.MOV(temp2, temp); + + bld.ADD(byte_offset(dest, r * dest_stride), + temp2, + byte_offset(src0, r * dest_stride)); + } else { + bld.ADD(byte_offset(dest, r * dest_stride), + temp, + byte_offset(src0, r * dest_stride)); + } + } else { + bld.MOV(byte_offset(dest, r * dest_stride), temp); + } + } +} + +static void +int8_using_dp4a(const fs_builder &bld, fs_inst *inst) +{ + /* We only intend to support configurations where the destination and + * accumulator have the same type. + */ + if (!inst->src[0].is_null()) + assert(inst->dst.type == inst->src[0].type); + + assert(inst->src[1].type == BRW_REGISTER_TYPE_B || + inst->src[1].type == BRW_REGISTER_TYPE_UB); + assert(inst->src[2].type == BRW_REGISTER_TYPE_B || + inst->src[2].type == BRW_REGISTER_TYPE_UB); + + const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB + ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB + ? 
BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + fs_reg dest = inst->dst; + fs_reg src0 = inst->src[0]; + const fs_reg src1 = retype(inst->src[1], src1_type); + const fs_reg src2 = retype(inst->src[2], src2_type); + + const unsigned dest_stride = REG_SIZE; + + for (unsigned r = 0; r < inst->rcount; r++) { + if (!src0.is_null()) { + bld.MOV(dest, src0); + src0 = byte_offset(src0, dest_stride); + } else { + bld.MOV(dest, retype(brw_imm_d(0), dest.type)); + } + + for (unsigned s = 0; s < inst->sdepth; s++) { + bld.DP4A(dest, + dest, + byte_offset(src1, s * REG_SIZE), + component(byte_offset(src2, r * REG_SIZE), s)) + ->saturate = inst->saturate; + } + + dest = byte_offset(dest, dest_stride); + } +} + +static void +int8_using_mul_add(const fs_builder &bld, fs_inst *inst) +{ + /* We only intend to support configurations where the destination and + * accumulator have the same type. + */ + if (!inst->src[0].is_null()) + assert(inst->dst.type == inst->src[0].type); + + assert(inst->src[1].type == BRW_REGISTER_TYPE_B || + inst->src[1].type == BRW_REGISTER_TYPE_UB); + assert(inst->src[2].type == BRW_REGISTER_TYPE_B || + inst->src[2].type == BRW_REGISTER_TYPE_UB); + + const brw_reg_type src0_type = inst->dst.type; + + const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB + ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB + ? 
BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + fs_reg dest = inst->dst; + fs_reg src0 = inst->src[0]; + const fs_reg src1 = retype(inst->src[1], src1_type); + const fs_reg src2 = retype(inst->src[2], src2_type); + + const unsigned dest_stride = REG_SIZE; + + for (unsigned r = 0; r < inst->rcount; r++) { + if (!src0.is_null()) { + bld.MOV(dest, src0); + src0 = byte_offset(src0, dest_stride); + } else { + bld.MOV(dest, retype(brw_imm_d(0), dest.type)); + } + + for (unsigned s = 0; s < inst->sdepth; s++) { + fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + const brw_reg_type temp_type = + (inst->src[1].type == BRW_REGISTER_TYPE_B || + inst->src[2].type == BRW_REGISTER_TYPE_B) + ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW; + + /* Expand 8 dwords of packed bytes into 16 dwords of packed + * words. + * + * FINISHME: Gfx9 should not need this work around. Gfx11 + * may be able to use integer MAD. Both platforms may be + * able to use MAC. 
+ */ + bld.group(32, 0).MOV(retype(temp3, temp_type), + retype(byte_offset(src2, r * REG_SIZE), + inst->src[2].type)); + + bld.MUL(subscript(temp1, temp_type, 0), + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + inst->src[1].type, 0), + subscript(component(retype(temp3, + BRW_REGISTER_TYPE_UD), + s * 2), + temp_type, 0)); + + bld.MUL(subscript(temp1, temp_type, 1), + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + inst->src[1].type, 1), + subscript(component(retype(temp3, + BRW_REGISTER_TYPE_UD), + s * 2), + temp_type, 1)); + + bld.MUL(subscript(temp2, temp_type, 0), + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + inst->src[1].type, 2), + subscript(component(retype(temp3, + BRW_REGISTER_TYPE_UD), + s * 2 + 1), + temp_type, 0)); + + bld.MUL(subscript(temp2, temp_type, 1), + subscript(retype(byte_offset(src1, s * REG_SIZE), + BRW_REGISTER_TYPE_UD), + inst->src[1].type, 3), + subscript(component(retype(temp3, + BRW_REGISTER_TYPE_UD), + s * 2 + 1), + temp_type, 1)); + + bld.ADD(subscript(temp1, src0_type, 0), + subscript(temp1, temp_type, 0), + subscript(temp1, temp_type, 1)); + + bld.ADD(subscript(temp2, src0_type, 0), + subscript(temp2, temp_type, 0), + subscript(temp2, temp_type, 1)); + + bld.ADD(retype(temp1, src0_type), + retype(temp1, src0_type), + retype(temp2, src0_type)); + + bld.ADD(dest, dest, retype(temp1, src0_type)) + ->saturate = inst->saturate; + } + + dest = byte_offset(dest, dest_stride); + } +} + +bool +brw_lower_dpas(fs_visitor &v) +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) { + if (inst->opcode != BRW_OPCODE_DPAS) + continue; + + const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all(); + + if (brw_reg_type_is_floating_point(inst->dst.type)) { + f16_using_mac(bld, inst); + } else { + if (v.devinfo->ver >= 12) { + int8_using_dp4a(bld, inst); + } else { + int8_using_mul_add(bld, inst); + } + } + + 
inst->remove(block); + progress = true; + } + + if (progress) + v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_lower_pack.cpp b/src/intel/compiler/elk/brw_fs_lower_pack.cpp new file mode 100644 index 00000000000..3a60989ecda --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_lower_pack.cpp @@ -0,0 +1,92 @@ +/* + * Copyright © 2015 Connor Abbott + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#include "util/half_float.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+bool
+fs_visitor::lower_pack() /* Lower FS_OPCODE_PACK / FS_OPCODE_PACK_HALF_2x16_SPLIT into per-word MOV / F32TO16 sequences. */
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode != FS_OPCODE_PACK &&
+          inst->opcode != FS_OPCODE_PACK_HALF_2x16_SPLIT)
+         continue;
+
+      assert(inst->dst.file == VGRF);
+      assert(inst->saturate == false); /* saturate semantics would not survive the split into per-word MOVs */
+      fs_reg dst = inst->dst;
+
+      const fs_builder ibld(this, block, inst);
+      /* The lowering generates 2 instructions for what was previously 1. This
+       * can trick the IR to believe we're doing partial writes, but the
+       * register is actually fully written. Mark it as undef to help the IR
+       * reduce the liveness of the register.
+       */
+      if (!inst->is_partial_write())
+         ibld.emit_undef_for_dst(inst);
+
+      switch (inst->opcode) {
+      case FS_OPCODE_PACK:
+         for (unsigned i = 0; i < inst->sources; i++)
+            ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);
+         break;
+      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+         assert(dst.type == BRW_REGISTER_TYPE_UD);
+
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (inst->src[i].file == IMM) {
+               const uint32_t half = _mesa_float_to_half(inst->src[i].f); /* constant-fold the f32->f16 conversion */
+               ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, i),
+                        brw_imm_uw(half));
+            } else if (i == 1 && devinfo->ver < 9) {
+               /* Pre-Skylake requires DWord aligned destinations */
+               fs_reg tmp = ibld.vgrf(BRW_REGISTER_TYPE_UD);
+               ibld.F32TO16(subscript(tmp, BRW_REGISTER_TYPE_HF, 0),
+                            inst->src[i]);
+               ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, 1),
+                        subscript(tmp, BRW_REGISTER_TYPE_UW, 0));
+            } else {
+               ibld.F32TO16(subscript(dst, BRW_REGISTER_TYPE_HF, i),
+                            inst->src[i]);
+            }
+         }
+         break;
+      default:
+         unreachable("skipped above");
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+
+   return progress;
+}
diff --git a/src/intel/compiler/elk/brw_fs_lower_regioning.cpp
b/src/intel/compiler/elk/brw_fs_lower_regioning.cpp new file mode 100644 index 00000000000..3bff7770cd0 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_lower_regioning.cpp @@ -0,0 +1,677 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_fs_builder.h" + +using namespace brw; + +namespace { + /* From the SKL PRM Vol 2a, "Move": + * + * "A mov with the same source and destination type, no source modifier, + * and no saturation is a raw move. A packed byte destination region (B + * or UB type with HorzStride == 1 and ExecSize > 1) can only be written + * using raw move." 
+    */
+   bool
+   is_byte_raw_mov(const fs_inst *inst)
+   {
+      return type_sz(inst->dst.type) == 1 &&
+             inst->opcode == BRW_OPCODE_MOV &&
+             inst->src[0].type == inst->dst.type &&
+             !inst->saturate &&
+             !inst->src[0].negate &&
+             !inst->src[0].abs;
+   }
+
+   /*
+    * Return an acceptable byte stride for the destination of an instruction
+    * that requires it to have some particular alignment.
+    */
+   unsigned
+   required_dst_byte_stride(const fs_inst *inst)
+   {
+      if (inst->dst.is_accumulator()) {
+         /* If the destination is an accumulator, insist that we leave the
+          * stride alone.  We cannot "fix" accumulator destinations by writing
+          * to a temporary and emitting a MOV into the original destination.
+          * For multiply instructions (our one use of the accumulator), the
+          * MUL writes the full 66 bits of the accumulator whereas the MOV we
+          * would emit only writes 33 bits and leaves the top 33 bits
+          * undefined.
+          *
+          * It's safe to just require the original stride here because the
+          * lowering pass will detect the mismatch in has_invalid_src_region
+          * and fix the sources of the multiply instead of the destination.
+          */
+         return inst->dst.stride * type_sz(inst->dst.type);
+      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
+                 !is_byte_raw_mov(inst)) {
+         return get_exec_type_size(inst);
+      } else {
+         /* Calculate the maximum byte stride and the minimum/maximum type
+          * size across all source and destination operands we are required to
+          * lower.
+          */
+         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
+         unsigned min_size = type_sz(inst->dst.type);
+         unsigned max_size = type_sz(inst->dst.type);
+
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
+               const unsigned size = type_sz(inst->src[i].type);
+               max_stride = MAX2(max_stride, inst->src[i].stride * size);
+               min_size = MIN2(min_size, size);
+               max_size = MAX2(max_size, size);
+            }
+         }
+
+         /* All operands involved in lowering need to fit in the calculated
+          * stride.
+          */
+         assert(max_size <= 4 * min_size);
+
+         /* Attempt to use the largest byte stride among all present operands,
+          * but never exceed a stride of 4 since that would lead to illegal
+          * destination regions during lowering.
+          */
+         return MIN2(max_stride, 4 * min_size);
+      }
+   }
+
+   /*
+    * Return an acceptable byte sub-register offset for the destination of an
+    * instruction that requires it to be aligned to the sub-register offset of
+    * the sources.
+    */
+   unsigned
+   required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
+            if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
+                reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
+               return 0; /* sources disagree on sub-register offset: require offset 0 */
+      }
+
+      return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+   }
+
+   /*
+    * Return the closest legal execution type for an instruction on
+    * the specified platform.
+    */
+   brw_reg_type
+   required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      const brw_reg_type t = get_exec_type(inst);
+      const bool has_64bit = brw_reg_type_is_floating_point(t) ?
+         devinfo->has_64bit_float : devinfo->has_64bit_int;
+
+      switch (inst->opcode) {
+      case SHADER_OPCODE_SHUFFLE:
+         /* IVB has an issue (which we found empirically) where it reads
+          * two address register components per channel for indirectly
+          * addressed 64-bit sources.
+          *
+          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+          *
+          *    "When source or destination datatype is 64b or operation is
+          *     integer DWord multiply, indirect addressing must not be
+          *     used."
+          *
+          * Work around both of the above and handle platforms that
+          * don't support 64-bit types at all.
+          */
+         if ((!devinfo->has_64bit_int ||
+              devinfo->platform == INTEL_PLATFORM_CHV ||
+              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
+            return BRW_REGISTER_TYPE_UD;
+         else if (has_dst_aligned_region_restriction(devinfo, inst))
+            return brw_int_type(type_sz(t), false);
+         else
+            return t;
+
+      case SHADER_OPCODE_SEL_EXEC:
+         if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
+             type_sz(t) > 4)
+            return BRW_REGISTER_TYPE_UD;
+         else
+            return t;
+
+      case SHADER_OPCODE_QUAD_SWIZZLE:
+         if (has_dst_aligned_region_restriction(devinfo, inst))
+            return brw_int_type(type_sz(t), false);
+         else
+            return t;
+
+      case SHADER_OPCODE_CLUSTER_BROADCAST:
+         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
+          *
+          *    "When source or destination datatype is 64b or operation is
+          *     integer DWord multiply, indirect addressing must not be
+          *     used."
+          *
+          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
+          * Therefore we need to lower cluster broadcast using 32-bit int ops.
+          *
+          * For gfx12.5+ platforms that support int64, the register regions
+          * used by cluster broadcast aren't supported by the 64-bit pipeline.
+          *
+          * Work around the above and handle platforms that don't
+          * support 64-bit types at all.
+          */
+         if ((!has_64bit || devinfo->verx10 >= 125 ||
+              devinfo->platform == INTEL_PLATFORM_CHV ||
+              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
+            return BRW_REGISTER_TYPE_UD;
+         else
+            return brw_int_type(type_sz(t), false);
+
+      case SHADER_OPCODE_BROADCAST:
+      case SHADER_OPCODE_MOV_INDIRECT:
+         if (((devinfo->verx10 == 70 ||
+               devinfo->platform == INTEL_PLATFORM_CHV ||
+               intel_device_info_is_9lp(devinfo) ||
+               devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
+             (devinfo->verx10 >= 125 &&
+              brw_reg_type_is_floating_point(inst->src[0].type)))
+            return brw_int_type(type_sz(t), false);
+         else
+            return t;
+
+      default:
+         return t;
+      }
+   }
+
+   /*
+    * Return the stride between channels of the specified register in
+    * byte units, or ~0u if the region cannot be represented with a
+    * single one-dimensional stride.
+    */
+   unsigned
+   byte_stride(const fs_reg &reg)
+   {
+      switch (reg.file) {
+      case BAD_FILE:
+      case UNIFORM:
+      case IMM:
+      case VGRF:
+      case MRF:
+      case ATTR:
+         return reg.stride * type_sz(reg.type);
+      case ARF:
+      case FIXED_GRF:
+         if (reg.is_null()) {
+            return 0; /* null register: no channel stride */
+         } else {
+            const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0; /* hstride/vstride are log2+1 encoded, 0 = stride of 0 */
+            const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
+            const unsigned width = 1 << reg.width;
+
+            if (width == 1) {
+               return vstride * type_sz(reg.type);
+            } else if (hstride * width == vstride) {
+               return hstride * type_sz(reg.type);
+            } else {
+               return ~0u;
+            }
+         }
+      default:
+         unreachable("Invalid register file");
+      }
+   }
+
+   /*
+    * Return whether the instruction has an unsupported channel bit layout
+    * specified for the i-th source region.
+    */
+   bool
+   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
+                          unsigned i)
+   {
+      if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
+          inst->opcode == BRW_OPCODE_DPAS) {
+         return false;
+      }
+
+      /* Empirical testing shows that Broadwell has a bug affecting half-float
+       * MAD instructions when any of its sources has a non-zero offset, such
+       * as:
+       *
+       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
+       *
+       * We used to generate code like this for SIMD8 executions where we
+       * used to pack components Y and W of a vector at offset 16B of a SIMD
+       * register. The problem doesn't occur if the stride of the source is 0.
+       */
+      if (devinfo->ver == 8 &&
+          inst->opcode == BRW_OPCODE_MAD &&
+          inst->src[i].type == BRW_REGISTER_TYPE_HF &&
+          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
+          inst->src[i].stride != 0) {
+         return true;
+      }
+
+      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
+
+      return has_dst_aligned_region_restriction(devinfo, inst) &&
+             !is_uniform(inst->src[i]) &&
+             (byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
+              src_byte_offset != dst_byte_offset);
+   }
+
+   /*
+    * Return whether the instruction has an unsupported channel bit layout
+    * specified for the destination region.
+    */
+   bool
+   has_invalid_dst_region(const intel_device_info *devinfo,
+                          const fs_inst *inst)
+   {
+      if (is_send(inst) || inst->is_math()) {
+         return false;
+      } else {
+         const brw_reg_type exec_type = get_exec_type(inst);
+         const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
+         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
+            type_sz(inst->dst.type) < type_sz(exec_type);
+
+         return (has_dst_aligned_region_restriction(devinfo, inst) &&
+                 (required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
+                  required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
+                (is_narrowing_conversion &&
+                 required_dst_byte_stride(inst) != byte_stride(inst->dst));
+      }
+   }
+
+   /**
+    * Return a non-zero value if the execution type of the instruction is
+    * unsupported.  The destination and sources matching the returned mask
+    * will be bit-cast to an integer type of appropriate size, lowering any
+    * source or destination modifiers into separate MOV instructions.
+    */
+   unsigned
+   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
+         switch (inst->opcode) {
+         case SHADER_OPCODE_SHUFFLE:
+         case SHADER_OPCODE_QUAD_SWIZZLE:
+         case SHADER_OPCODE_CLUSTER_BROADCAST:
+         case SHADER_OPCODE_BROADCAST:
+         case SHADER_OPCODE_MOV_INDIRECT:
+            return 0x1; /* bit-cast src0 (and the destination) only */
+
+         case SHADER_OPCODE_SEL_EXEC:
+            return 0x3; /* SEL_EXEC reads both sources */
+
+         default:
+            unreachable("Unknown invalid execution type source mask.");
+         }
+      } else {
+         return 0;
+      }
+   }
+
+   /*
+    * Return whether the instruction has unsupported source modifiers
+    * specified for the i-th source region.
+    */
+   bool
+   has_invalid_src_modifiers(const intel_device_info *devinfo,
+                             const fs_inst *inst, unsigned i)
+   {
+      return (!inst->can_do_source_mods(devinfo) &&
+              (inst->src[i].negate || inst->src[i].abs)) ||
+             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
+              (inst->src[i].negate || inst->src[i].abs ||
+               inst->src[i].type != get_exec_type(inst)));
+   }
+
+   /*
+    * Return whether the instruction has an unsupported type conversion
+    * specified for the destination.
+    */
+   bool
+   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         return false;
+      case BRW_OPCODE_SEL:
+         return inst->dst.type != get_exec_type(inst);
+      default:
+         /* FIXME: We assume the opcodes not explicitly mentioned before just
+          * work fine with arbitrary conversions, unless they need to be
+          * bit-cast.
+          */
+         return has_invalid_exec_type(devinfo, inst) &&
+                inst->dst.type != get_exec_type(inst);
+      }
+   }
+
+   /**
+    * Return whether the instruction has unsupported destination modifiers.
+    */
+   bool
+   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
+   {
+      return (has_invalid_exec_type(devinfo, inst) &&
+              (inst->saturate || inst->conditional_mod)) ||
+             has_invalid_conversion(devinfo, inst);
+   }
+
+   /**
+    * Return whether the instruction has non-standard semantics for the
+    * conditional mod which don't cause the flag register to be updated with
+    * the comparison result.
+    */
+   bool
+   has_inconsistent_cmod(const fs_inst *inst)
+   {
+      return inst->opcode == BRW_OPCODE_SEL ||
+             inst->opcode == BRW_OPCODE_CSEL ||
+             inst->opcode == BRW_OPCODE_IF ||
+             inst->opcode == BRW_OPCODE_WHILE;
+   }
+
+   bool
+   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
+}
+
+namespace brw {
+   /**
+    * Remove any modifiers from the \p i-th source region of the instruction,
+    * including negate, abs and any implicit type conversion to the execution
+    * type.  Instead any source modifiers will be implemented as a separate
+    * MOV instruction prior to the original instruction.
+    */
+   bool
+   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
+   {
+      assert(inst->components_read(i) == 1);
+      assert(v->devinfo->has_integer_dword_mul ||
+             inst->opcode != BRW_OPCODE_MUL ||
+             brw_reg_type_is_floating_point(get_exec_type(inst)) ||
+             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
+             type_sz(inst->src[i].type) == get_exec_type_size(inst));
+
+      const fs_builder ibld(v, block, inst);
+      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
+
+      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i])); /* the new MOV may itself need further lowering */
+      inst->src[i] = tmp;
+
+      return true;
+   }
+}
+
+namespace {
+   /**
+    * Remove any modifiers from the destination region of the instruction,
+    * including saturate, conditional mod and any implicit type conversion
+    * from the execution type.  Instead any destination modifiers will be
+    * implemented as a separate MOV instruction after the original
+    * instruction.
+    */
+   bool
+   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const fs_builder ibld(v, block, inst);
+      const brw_reg_type type = get_exec_type(inst);
+      /* Not strictly necessary, but if possible use a temporary with the same
+       * channel alignment as the current destination in order to avoid
+       * violating the restrictions enforced later on by lower_src_region()
+       * and lower_dst_region(), which would introduce additional copy
+       * instructions into the program unnecessarily.
+       */
+      const unsigned stride =
+         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
+         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
+      fs_reg tmp = ibld.vgrf(type, stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, stride);
+
+      /* Emit a MOV taking care of all the destination modifiers.
+       */
+      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
+      mov->saturate = inst->saturate;
+      if (!has_inconsistent_cmod(inst))
+         mov->conditional_mod = inst->conditional_mod;
+      if (inst->opcode != BRW_OPCODE_SEL) {
+         mov->predicate = inst->predicate;
+         mov->predicate_inverse = inst->predicate_inverse;
+      }
+      mov->flag_subreg = inst->flag_subreg;
+      lower_instruction(v, block, mov); /* recursively legalize the fix-up MOV */
+
+      /* Point the original instruction at the temporary, and clean up any
+       * destination modifiers.
+       */
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      inst->dst = tmp;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+      inst->saturate = false;
+      if (!has_inconsistent_cmod(inst))
+         inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
+      return true;
+   }
+
+   /**
+    * Remove any non-trivial shuffling of data from the \p i-th source region
+    * of the instruction.  Instead implement the region as a series of integer
+    * copies into a temporary with the same channel layout as the destination.
+    */
+   bool
+   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
+   {
+      assert(inst->components_read(i) == 1);
+      const fs_builder ibld(v, block, inst);
+      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
+                              type_sz(inst->src[i].type);
+      assert(stride > 0);
+      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, stride);
+
+      /* Emit a series of 32-bit integer copies with any source modifiers
+       * cleaned up (because their semantics are dependent on the type).
+       */
+      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
+                                                 false);
+      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
+      fs_reg raw_src = inst->src[i];
+      raw_src.negate = false;
+      raw_src.abs = false;
+
+      for (unsigned j = 0; j < n; j++)
+         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
+
+      /* Point the original instruction at the temporary, making sure to keep
+       * any source modifiers in the instruction.
+       */
+      fs_reg lower_src = tmp;
+      lower_src.negate = inst->src[i].negate;
+      lower_src.abs = inst->src[i].abs;
+      inst->src[i] = lower_src;
+
+      return true;
+   }
+
+   /**
+    * Remove any non-trivial shuffling of data from the destination region of
+    * the instruction.  Instead implement the region as a series of integer
+    * copies from a temporary with a channel layout compatible with the
+    * sources.
+    */
+   bool
+   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      /* We cannot replace the result of an integer multiply which writes the
+       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
+       * value whereas the MOV will act on only 32 or 33 bits of the
+       * accumulator.
+       */
+      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
+             brw_reg_type_is_floating_point(inst->dst.type));
+
+      const fs_builder ibld(v, block, inst);
+      const unsigned stride = required_dst_byte_stride(inst) /
+                              type_sz(inst->dst.type);
+      assert(stride > 0);
+      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, stride);
+
+      /* Emit a series of 32-bit integer copies from the temporary into the
+       * original destination.
+       */
+      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
+                                                 false);
+      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
+
+      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
+         /* Note that in general we cannot simply predicate the copies on the
+          * same flag register as the original instruction, since it may have
+          * been overwritten by the instruction itself.  Instead initialize
+          * the temporary with the previous contents of the destination
+          * register.
+          */
+         for (unsigned j = 0; j < n; j++)
+            ibld.MOV(subscript(tmp, raw_type, j),
+                     subscript(inst->dst, raw_type, j));
+      }
+
+      for (unsigned j = 0; j < n; j++)
+         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
+                                        subscript(tmp, raw_type, j));
+
+      /* Point the original instruction at the temporary, making sure to keep
+       * any destination modifiers in the instruction.
+       */
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      inst->dst = tmp;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+
+      return true;
+   }
+
+   /**
+    * Change sources and destination of the instruction to an
+    * appropriate legal type, splitting the instruction into multiple
+    * ones of smaller execution type if necessary, to be used in cases
+    * where the execution type of an instruction is unsupported.
+    */
+   bool
+   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      assert(inst->dst.type == get_exec_type(inst));
+      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
+      const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
+      const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
+      const fs_builder ibld(v, block, inst);
+
+      fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, inst->dst.stride);
+
+      for (unsigned j = 0; j < n; j++) {
+         fs_inst sub_inst = *inst; /* shallow copy, then narrow dst/srcs to sub-word j */
+
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (mask & (1u << i)) {
+               assert(inst->src[i].type == inst->dst.type);
+               sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
+            }
+         }
+
+         sub_inst.dst = subscript(tmp, raw_type, j);
+
+         assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
+         assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
+         ibld.emit(sub_inst);
+
+         fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
+                                 subscript(tmp, raw_type, j));
+         if (inst->opcode != BRW_OPCODE_SEL) {
+            mov->predicate = inst->predicate;
+            mov->predicate_inverse = inst->predicate_inverse;
+         }
+         lower_instruction(v, block, mov); /* the copy-back MOV may itself need lowering */
+      }
+
+      inst->remove(block);
+
+      return true;
+   }
+
+   /**
+    * Legalize the source and destination regioning controls of the specified
+    * instruction.
+    */
+   bool
+   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const intel_device_info *devinfo = v->devinfo;
+      bool progress = false;
+
+      if (has_invalid_dst_modifiers(devinfo, inst))
+         progress |= lower_dst_modifiers(v, block, inst);
+
+      if (has_invalid_dst_region(devinfo, inst))
+         progress |= lower_dst_region(v, block, inst);
+
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (has_invalid_src_modifiers(devinfo, inst, i))
+            progress |= lower_src_modifiers(v, block, inst, i);
+
+         if (has_invalid_src_region(devinfo, inst, i))
+            progress |= lower_src_region(v, block, inst, i);
+      }
+
+      if (has_invalid_exec_type(devinfo, inst))
+         progress |= lower_exec_type(v, block, inst);
+
+      return progress;
+   }
+}
+
+bool
+fs_visitor::lower_regioning() /* top-level entry point of the regioning legalization pass */
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
+      progress |= lower_instruction(this, block, inst);
+
+   if (progress)
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
+
+   return progress;
+}
diff --git a/src/intel/compiler/elk/brw_fs_nir.cpp b/src/intel/compiler/elk/brw_fs_nir.cpp
new file mode 100644
index 00000000000..d16ca1a5ae8
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_nir.cpp
@@ -0,0 +1,8804 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_builder.h"
+#include "brw_nir.h"
+#include "brw_eu.h"
+#include "nir.h"
+#include "nir_intrinsics.h"
+#include "nir_search_helpers.h"
+#include "util/u_math.h"
+#include "util/bitscan.h"
+
+/* NOTE(review): the target of this include was lost during extraction
+ * (angle-bracket header name stripped); presumably a C++ standard header
+ * — confirm against the upstream file.
+ */
+#include 
+
+using namespace brw;
+
+/* Description of a resource binding attached to an SSA def (see
+ * nir_to_brw_state::ssa_bind_infos below).
+ */
+struct brw_fs_bind_info {
+   bool valid;
+   bool bindless;
+   unsigned block;
+   unsigned set;
+   unsigned binding;
+};
+
+/* Transient state of the NIR -> BRW IR translation.  One instance is
+ * threaded through all of the fs_nir_emit_*() helpers for the duration
+ * of the conversion.
+ */
+struct nir_to_brw_state {
+   fs_visitor &s;
+   const nir_shader *nir;
+   const intel_device_info *devinfo;
+   void *mem_ctx;
+
+   /* Points to the end of the program.  Annotated with the current NIR
+    * instruction when applicable.
+    */
+   fs_builder bld;
+
+   /* Per-SSA-def lookup tables, all sized by impl->ssa_alloc and
+    * allocated in fs_nir_emit_impl().
+    */
+   fs_reg *ssa_values;
+   fs_inst **resource_insts;
+   struct brw_fs_bind_info *ssa_bind_infos;
+   fs_reg *resource_values;
+   /* Indexed by SYSTEM_VALUE_*; filled in fs_nir_emit_system_values(). */
+   fs_reg *system_values;
+};
+
+static fs_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src);
+static fs_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def);
+static nir_component_mask_t get_nir_write_mask(const nir_def &def);
+
+static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
+static fs_reg emit_samplepos_setup(nir_to_brw_state &ntb);
+static fs_reg emit_sampleid_setup(nir_to_brw_state &ntb);
+static fs_reg emit_samplemaskin_setup(nir_to_brw_state &ntb);
+static fs_reg emit_shading_rate_setup(nir_to_brw_state &ntb);
+
+static void fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl);
+static void fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list);
+static void fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt);
+static void fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop);
+static void fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block);
+static void fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr);
+
+static void fs_nir_emit_surface_atomic(nir_to_brw_state &ntb,
+                                       const fs_builder &bld,
+                                       nir_intrinsic_instr *instr,
+                                       fs_reg surface,
+                                       bool bindless);
+static void fs_nir_emit_global_atomic(nir_to_brw_state &ntb,
+                                      const fs_builder &bld,
+                                      nir_intrinsic_instr *instr);
+
+/* Allocate the outputs[] registers for the current stage.  Stages whose
+ * outputs are written through messages instead of the outputs[] array
+ * (TCS, task, mesh, fragment) are skipped.
+ */
+static void
+fs_nir_setup_outputs(nir_to_brw_state &ntb)
+{
+   fs_visitor &s = ntb.s;
+
+   if (s.stage == MESA_SHADER_TESS_CTRL ||
+       s.stage == MESA_SHADER_TASK ||
+       s.stage == MESA_SHADER_MESH ||
+       s.stage == MESA_SHADER_FRAGMENT)
+      return;
+
+   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
+
+   /* Calculate the size of output registers in a separate pass, before
+    * allocating them.  With ARB_enhanced_layouts, multiple output variables
+    * may occupy the same slot, but have different type sizes.
+    */
+   nir_foreach_shader_out_variable(var, s.nir) {
+      const int loc = var->data.driver_location;
+      const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
+      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
+   }
+
+   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
+      if (vec4s[loc] == 0) {
+         loc++;
+         continue;
+      }
+
+      unsigned reg_size = vec4s[loc];
+
+      /* Check if there are any ranges that start within this range and extend
+       * past it. If so, include them in this allocation.
+       */
+      for (unsigned i = 1; i < reg_size; i++) {
+         assert(i + loc < ARRAY_SIZE(vec4s));
+         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
+      }
+
+      /* One vec4 == four float components per slot. */
+      fs_reg reg = ntb.bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
+      for (unsigned i = 0; i < reg_size; i++) {
+         assert(loc + i < ARRAY_SIZE(s.outputs));
+         s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
+      }
+
+      loc += reg_size;
+   }
+}
+
+/* Record the number of user uniforms and, for pre-Gfx12.5 compute,
+ * append the subgroup-ID builtin param.  A no-op on recompiles.
+ */
+static void
+fs_nir_setup_uniforms(fs_visitor &s)
+{
+   const intel_device_info *devinfo = s.devinfo;
+
+   /* Only the first compile gets to set up uniforms. */
+   if (s.push_constant_loc)
+      return;
+
+   s.uniforms = s.nir->num_uniforms / 4;
+
+   if (gl_shader_stage_is_compute(s.stage) && devinfo->verx10 < 125) {
+      /* Add uniforms for builtins after regular NIR uniforms. */
+      assert(s.uniforms == s.prog_data->nr_params);
+
+      /* Subgroup ID must be the last uniform on the list.  This will make
+       * easier later to split between cross thread and per thread
+       * uniforms.
+ */ + uint32_t *param = brw_stage_prog_data_add_params(s.prog_data, 1); + *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; + s.uniforms++; + } +} + +static fs_reg +emit_work_group_id_setup(nir_to_brw_state &ntb) +{ + fs_visitor &s = ntb.s; + const fs_builder &bld = ntb.bld; + + assert(gl_shader_stage_is_compute(s.stage)); + + fs_reg id = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); + + struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); + bld.MOV(id, r0_1); + + struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD)); + struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD)); + bld.MOV(offset(id, bld, 1), r0_6); + bld.MOV(offset(id, bld, 2), r0_7); + + return id; +} + +static bool +emit_system_values_block(nir_to_brw_state &ntb, nir_block *block) +{ + fs_visitor &s = ntb.s; + fs_reg *reg; + + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_base_vertex: + unreachable("should be lowered by nir_lower_system_values()."); + + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_is_indexed_draw: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_base_instance: + unreachable("should be lowered by brw_nir_lower_vs_inputs()."); + break; + + case nir_intrinsic_load_draw_id: + /* For Task/Mesh, draw_id will be handled later in + * nir_emit_mesh_task_intrinsic(). 
+ */ + if (!gl_shader_stage_is_mesh(s.stage)) + unreachable("should be lowered by brw_nir_lower_vs_inputs()."); + break; + + case nir_intrinsic_load_invocation_id: + if (s.stage == MESA_SHADER_TESS_CTRL) + break; + assert(s.stage == MESA_SHADER_GEOMETRY); + reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID]; + if (reg->file == BAD_FILE) { + *reg = s.gs_payload().instance_id; + } + break; + + case nir_intrinsic_load_sample_pos: + case nir_intrinsic_load_sample_pos_or_center: + assert(s.stage == MESA_SHADER_FRAGMENT); + reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS]; + if (reg->file == BAD_FILE) + *reg = emit_samplepos_setup(ntb); + break; + + case nir_intrinsic_load_sample_id: + assert(s.stage == MESA_SHADER_FRAGMENT); + reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]; + if (reg->file == BAD_FILE) + *reg = emit_sampleid_setup(ntb); + break; + + case nir_intrinsic_load_sample_mask_in: + assert(s.stage == MESA_SHADER_FRAGMENT); + assert(s.devinfo->ver >= 7); + reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN]; + if (reg->file == BAD_FILE) + *reg = emit_samplemaskin_setup(ntb); + break; + + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_workgroup_id_zero_base: + if (gl_shader_stage_is_mesh(s.stage)) + unreachable("should be lowered by nir_lower_compute_system_values()."); + assert(gl_shader_stage_is_compute(s.stage)); + reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID]; + if (reg->file == BAD_FILE) + *reg = emit_work_group_id_setup(ntb); + break; + + case nir_intrinsic_load_helper_invocation: + assert(s.stage == MESA_SHADER_FRAGMENT); + reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION]; + if (reg->file == BAD_FILE) { + const fs_builder abld = + ntb.bld.annotate("gl_HelperInvocation", NULL); + + /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the + * pixel mask is in g1.7 of the thread payload. 
+ * + * We move the per-channel pixel enable bit to the low bit of each + * channel by shifting the byte containing the pixel mask by the + * vector immediate 0x76543210UV. + * + * The region of <1,8,0> reads only 1 byte (the pixel masks for + * subspans 0 and 1) in SIMD8 and an additional byte (the pixel + * masks for 2 and 3) in SIMD16. + */ + fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1); + + for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) { + const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i); + /* According to the "PS Thread Payload for Normal + * Dispatch" pages on the BSpec, the dispatch mask is + * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on + * gfx6+. + */ + const struct brw_reg reg = s.devinfo->ver >= 20 ? + xe2_vec1_grf(i, 15) : brw_vec1_grf(i + 1, 7); + hbld.SHR(offset(shifted, hbld, i), + stride(retype(reg, BRW_REGISTER_TYPE_UB), 1, 8, 0), + brw_imm_v(0x76543210)); + } + + /* A set bit in the pixel mask means the channel is enabled, but + * that is the opposite of gl_HelperInvocation so we need to invert + * the mask. + * + * The negate source-modifier bit of logical instructions on Gfx8+ + * performs 1's complement negation, so we can use that instead of + * a NOT instruction. + */ + fs_reg inverted = negate(shifted); + if (s.devinfo->ver < 8) { + inverted = abld.vgrf(BRW_REGISTER_TYPE_UW); + abld.NOT(inverted, shifted); + } + + /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing + * with 1 and negating. 
+ */ + fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.AND(anded, inverted, brw_imm_uw(1)); + + fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1); + abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D))); + *reg = dst; + } + break; + + case nir_intrinsic_load_frag_shading_rate: + reg = &ntb.system_values[SYSTEM_VALUE_FRAG_SHADING_RATE]; + if (reg->file == BAD_FILE) + *reg = emit_shading_rate_setup(ntb); + break; + + default: + break; + } + } + + return true; +} + +static void +fs_nir_emit_system_values(nir_to_brw_state &ntb) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + ntb.system_values = ralloc_array(ntb.mem_ctx, fs_reg, SYSTEM_VALUE_MAX); + for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { + ntb.system_values[i] = fs_reg(); + } + + /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we + * never end up using it. + */ + { + const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL); + fs_reg ® = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + reg = abld.vgrf(BRW_REGISTER_TYPE_UW); + abld.UNDEF(reg); + + const fs_builder allbld8 = abld.group(8, 0).exec_all(); + allbld8.MOV(reg, brw_imm_v(0x76543210)); + if (s.dispatch_width > 8) + allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u)); + if (s.dispatch_width > 16) { + const fs_builder allbld16 = abld.group(16, 0).exec_all(); + allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u)); + } + } + + nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir); + nir_foreach_block(block, impl) + emit_system_values_block(ntb, block); +} + +static void +fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl) +{ + ntb.ssa_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc); + ntb.resource_insts = rzalloc_array(ntb.mem_ctx, fs_inst *, impl->ssa_alloc); + ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct brw_fs_bind_info, impl->ssa_alloc); + ntb.resource_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc); + + 
+   fs_nir_emit_cf_list(ntb, &impl->body);
+}
+
+/* Dispatch each control-flow node of a NIR cf_list to the matching
+ * emit helper.
+ */
+static void
+fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list)
+{
+   exec_list_validate(list);
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      switch (node->type) {
+      case nir_cf_node_if:
+         fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
+         break;
+
+      case nir_cf_node_loop:
+         fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
+         break;
+
+      case nir_cf_node_block:
+         fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
+         break;
+
+      default:
+         unreachable("Invalid CFG node block");
+      }
+   }
+}
+
+/* Translate a NIR if-statement into IF/ELSE/ENDIF, folding a leading
+ * inot into the predicate-inverse bit where possible.
+ */
+static void
+fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+   const fs_builder &bld = ntb.bld;
+
+   bool invert;
+   fs_reg cond_reg;
+
+   /* If the condition has the form !other_condition, use other_condition as
+    * the source, but invert the predicate on the if instruction.
+    */
+   nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
+   if (cond != NULL && cond->op == nir_op_inot) {
+      invert = true;
+      cond_reg = get_nir_src(ntb, cond->src[0].src);
+      cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
+
+      if (devinfo->ver <= 5 &&
+          (cond->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
+         /* redo boolean resolve on gen5 */
+         fs_reg masked = ntb.s.vgrf(glsl_int_type());
+         bld.AND(masked, cond_reg, brw_imm_d(1));
+         masked.negate = true;
+         fs_reg tmp = bld.vgrf(cond_reg.type);
+         bld.MOV(retype(tmp, BRW_REGISTER_TYPE_D), masked);
+         cond_reg = tmp;
+      }
+   } else {
+      invert = false;
+      cond_reg = get_nir_src(ntb, if_stmt->condition);
+   }
+
+   /* first, put the condition into f0 */
+   fs_inst *inst = bld.MOV(bld.null_reg_d(),
+                           retype(cond_reg, BRW_REGISTER_TYPE_D));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+   bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
+
+   fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
+
+   /* ELSE is only needed when the else-list has real content. */
+   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
+      bld.emit(BRW_OPCODE_ELSE);
+
+      fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
+   }
+
+   bld.emit(BRW_OPCODE_ENDIF);
+
+   if (devinfo->ver < 7)
+      ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
+                                 "in SIMD32 mode.");
+}
+
+/* Translate a NIR loop into a DO ... WHILE block. */
+static void
+fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+   const fs_builder &bld = ntb.bld;
+
+   assert(!nir_loop_has_continue_construct(loop));
+   bld.emit(BRW_OPCODE_DO);
+
+   fs_nir_emit_cf_list(ntb, &loop->body);
+
+   bld.emit(BRW_OPCODE_WHILE);
+
+   if (devinfo->ver < 7)
+      ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
+                                 "in SIMD32 mode.");
+}
+
+/* Emit every instruction of a NIR block, restoring the builder cursor
+ * afterwards (fs_nir_emit_instr may re-annotate it).
+ */
+static void
+fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
+{
+   fs_builder bld = ntb.bld;
+
+   nir_foreach_instr(instr, block) {
+      fs_nir_emit_instr(ntb, instr);
+   }
+
+   ntb.bld = bld;
+}
+
+/**
+ * Recognizes a parent instruction of nir_op_extract_* and changes the type to
+ * match instr.
+ */
+static bool
+optimize_extract_to_float(nir_to_brw_state &ntb, nir_alu_instr *instr,
+                          const fs_reg &result)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+   const fs_builder &bld = ntb.bld;
+
+   if (!instr->src[0].src.ssa->parent_instr)
+      return false;
+
+   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *src0 =
+      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+
+   if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
+       src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
+      return false;
+
+   unsigned element = nir_src_as_uint(src0->src[1].src);
+
+   /* Element type to extract.*/
+   const brw_reg_type type = brw_int_type(
+      src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ?
+      2 : 1,
+      src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
+
+   fs_reg op0 = get_nir_src(ntb, src0->src[0].src);
+   op0.type = brw_type_for_nir_type(devinfo,
+      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
+                     nir_src_bit_size(src0->src[0].src)));
+   op0 = offset(op0, bld, src0->src[0].swizzle[0]);
+
+   /* A single MOV with a subscripted source does extract + convert. */
+   bld.MOV(result, subscript(op0, type, element));
+   return true;
+}
+
+/* Recognize bcsel(gl_FrontFacing, ±1.0, ∓1.0) and emit it directly from
+ * the front/back-facing payload bit instead of real control flow.
+ * Returns false when the pattern does not match.
+ */
+static bool
+optimize_frontfacing_ternary(nir_to_brw_state &ntb,
+                             nir_alu_instr *instr,
+                             const fs_reg &result)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+   fs_visitor &s = ntb.s;
+
+   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
+   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
+      return false;
+
+   if (!nir_src_is_const(instr->src[1].src) ||
+       !nir_src_is_const(instr->src[2].src))
+      return false;
+
+   const float value1 = nir_src_as_float(instr->src[1].src);
+   const float value2 = nir_src_as_float(instr->src[2].src);
+   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
+      return false;
+
+   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
+   assert(value1 == -value2);
+
+   fs_reg tmp = s.vgrf(glsl_int_type());
+
+   if (devinfo->ver >= 20) {
+      /* Gfx20+ has separate back-facing bits for each pair of
+       * subspans in order to support multiple polygons, so we need to
+       * use a <1;8,0> region in order to select the correct word for
+       * each channel.  Unfortunately they're no longer aligned to the
+       * sign bit of a 16-bit word, so a left shift is necessary.
+       */
+      fs_reg ff = ntb.bld.vgrf(BRW_REGISTER_TYPE_UW);
+
+      for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
+         const fs_builder hbld = ntb.bld.group(16, i);
+         const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
+                                             BRW_REGISTER_TYPE_UW);
+         hbld.SHL(offset(ff, hbld, i), stride(gi_uw, 1, 8, 0), brw_imm_ud(4));
+      }
+
+      if (value1 == -1.0f)
+         ff.negate = true;
+
+      ntb.bld.OR(subscript(tmp, BRW_REGISTER_TYPE_UW, 1), ff,
+                 brw_imm_uw(0x3f80));
+
+   } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
+      /* According to the BSpec "PS Thread Payload for Normal
+       * Dispatch", the front/back facing interpolation bit is stored
+       * as bit 15 of either the R1.1 or R1.6 poly info field, for the
+       * first and second polygons respectively in multipolygon PS
+       * dispatch mode.
+       */
+      assert(s.dispatch_width == 16);
+
+      for (unsigned i = 0; i < s.max_polygons; i++) {
+         const fs_builder hbld = ntb.bld.group(8, i);
+         struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
+                                    BRW_REGISTER_TYPE_UW);
+
+         if (value1 == -1.0f)
+            g1.negate = true;
+
+         hbld.OR(subscript(offset(tmp, hbld, i), BRW_REGISTER_TYPE_UW, 1),
+                 g1, brw_imm_uw(0x3f80));
+      }
+
+   } else if (devinfo->ver >= 12) {
+      /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
+      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp.1<2>W  g1.1<0,1,0>W  0x00003f80W
+       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
+       */
+      if (value1 == -1.0f)
+         g1.negate = true;
+
+      ntb.bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
+                 g1, brw_imm_uw(0x3f80));
+   } else if (devinfo->ver >= 6) {
+      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
+      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
+       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1 == -1.0f) {
+         g0.negate = true;
+      }
+
+      ntb.bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
+                 g0, brw_imm_uw(0x3f80));
+   } else {
+      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
+      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
+       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1 == -1.0f) {
+         g1_6.negate = true;
+      }
+
+      ntb.bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
+   }
+   ntb.bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
+
+   return true;
+}
+
+/* Map an explicit-rounding f2f16 NIR opcode to the BRW rounding mode. */
+static brw_rnd_mode
+brw_rnd_mode_from_nir_op (const nir_op op) {
+   switch (op) {
+   case nir_op_f2f16_rtz:
+      return BRW_RND_MODE_RTZ;
+   case nir_op_f2f16_rtne:
+      return BRW_RND_MODE_RTNE;
+   default:
+      unreachable("Operation doesn't support rounding mode");
+   }
+}
+
+/* Pick the rounding mode requested by the shader's float-controls
+ * execution mode, or UNSPECIFIED when none is requested.
+ */
+static brw_rnd_mode
+brw_rnd_mode_from_execution_mode(unsigned execution_mode)
+{
+   if (nir_has_any_rounding_mode_rtne(execution_mode))
+      return BRW_RND_MODE_RTNE;
+   if (nir_has_any_rounding_mode_rtz(execution_mode))
+      return BRW_RND_MODE_RTZ;
+   return BRW_RND_MODE_UNSPECIFIED;
+}
+
+/* Common setup for ALU translation: retype the destination to the NIR
+ * output type and fetch/retype every source into op[].  Returns the
+ * (possibly channel-offset) destination register.
+ */
+static fs_reg
+prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
+                                    const fs_builder &bld,
+                                    nir_alu_instr *instr,
+                                    fs_reg *op,
+                                    bool need_dest)
+{
+   const intel_device_info *devinfo = ntb.devinfo;
+
+   fs_reg result =
+      need_dest ?
+      get_nir_def(ntb, instr->def) : bld.null_reg_ud();
+
+   result.type = brw_type_for_nir_type(devinfo,
+      (nir_alu_type)(nir_op_infos[instr->op].output_type |
+                     instr->def.bit_size));
+
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      op[i] = get_nir_src(ntb, instr->src[i].src);
+      op[i].type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
+                        nir_src_bit_size(instr->src[i].src)));
+   }
+
+   /* Move and vecN instructions may still be vectored.  Return the raw,
+    * vectored source and destination so that fs_visitor::nir_emit_alu can
+    * handle it.  Other callers should not have to handle these kinds of
+    * instructions.
+    */
+   switch (instr->op) {
+   case nir_op_mov:
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4:
+   case nir_op_vec8:
+   case nir_op_vec16:
+      return result;
+   default:
+      break;
+   }
+
+   /* At this point, we have dealt with any instruction that operates on
+    * more than a single channel.  Therefore, we can just adjust the source
+    * and destination registers for that channel and emit the instruction.
+    */
+   unsigned channel = 0;
+   if (nir_op_infos[instr->op].output_size == 0) {
+      /* Since NIR is doing the scalarizing for us, we should only ever see
+       * vectorized operations with a single channel.
+       */
+      nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
+      assert(util_bitcount(write_mask) == 1);
+      channel = ffs(write_mask) - 1;
+
+      result = offset(result, bld, channel);
+   }
+
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
+      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
+   }
+
+   return result;
+}
+
+/* Flush abs/negate source modifiers into a fresh VGRF via a MOV so a
+ * later consumer sees an unmodified source.  Returns src untouched when
+ * it carries no modifiers.
+ */
+static fs_reg
+resolve_source_modifiers(const fs_builder &bld, const fs_reg &src)
+{
+   if (!src.abs && !src.negate)
+      return src;
+
+   fs_reg temp = bld.vgrf(src.type);
+   bld.MOV(temp, src);
+
+   return temp;
+}
+
+/* For a two-source logical op, fold any inot source into the negate
+ * (1's complement) source modifier; otherwise resolve existing
+ * modifiers so the logical op sees plain sources.
+ */
+static void
+resolve_inot_sources(nir_to_brw_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
+                     fs_reg *op)
+{
+   for (unsigned i = 0; i < 2; i++) {
+      nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
+
+      if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
+         /* The source of the inot is now the source of instr. */
+         prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
+
+         assert(!op[i].negate);
+         op[i].negate = true;
+      } else {
+         op[i] = resolve_source_modifiers(bld, op[i]);
+      }
+   }
+}
+
+/* Emit b2f/b2i(inot(a)) as a single ADD (float(1 + a), valid since a is
+ * 0 or -1).  Returns false when the pattern or device doesn't apply.
+ */
+static bool
+try_emit_b2fi_of_inot(nir_to_brw_state &ntb, const fs_builder &bld,
+                      fs_reg result,
+                      nir_alu_instr *instr)
+{
+   const intel_device_info *devinfo = bld.shader->devinfo;
+
+   if (devinfo->ver < 6 || devinfo->verx10 >= 125)
+      return false;
+
+   nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
+
+   if (inot_instr == NULL || inot_instr->op != nir_op_inot)
+      return false;
+
+   /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
+    * of valid size-changing combinations is a bit more complex.
+    *
+    * The source restriction is just because I was lazy about generating the
+    * constant below.
+    */
+   if (instr->def.bit_size != 32 ||
+       nir_src_bit_size(inot_instr->src[0].src) != 32)
+      return false;
+
+   /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
+    * this is float(1 + a).
+    */
+   fs_reg op;
+
+   prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
+
+   /* Ignore the saturate modifier, if there is one.  The result of the
+    * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
+    */
+   bld.ADD(result, op, brw_imm_d(1));
+
+   return true;
+}
+
+/**
+ * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
+ *
+ * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
+ * the source of \c instr that is a \c nir_op_fsign.
+ */
+static void
+emit_fsign(nir_to_brw_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
+           fs_reg result, fs_reg *op, unsigned fsign_src)
+{
+   fs_visitor &s = ntb.s;
+   const intel_device_info *devinfo = ntb.devinfo;
+
+   fs_inst *inst;
+
+   assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
+   assert(fsign_src < nir_op_infos[instr->op].num_inputs);
+
+   if (instr->op != nir_op_fsign) {
+      const nir_alu_instr *const fsign_instr =
+         nir_src_as_alu_instr(instr->src[fsign_src].src);
+
+      /* op[fsign_src] has the nominal result of the fsign, and op[1 -
+       * fsign_src] has the other multiply source.  This must be rearranged so
+       * that op[0] is the source of the fsign op[1] is the other multiply
+       * source.
+       */
+      if (fsign_src != 0)
+         op[1] = op[0];
+
+      op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
+
+      const nir_alu_type t =
+         (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
+                        nir_src_bit_size(fsign_instr->src[0].src));
+
+      op[0].type = brw_type_for_nir_type(devinfo, t);
+
+      unsigned channel = 0;
+      if (nir_op_infos[instr->op].output_size == 0) {
+         /* Since NIR is doing the scalarizing for us, we should only ever see
+          * vectorized operations with a single channel.
+          */
+         nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
+         assert(util_bitcount(write_mask) == 1);
+         channel = ffs(write_mask) - 1;
+      }
+
+      op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
+   }
+
+   if (type_sz(op[0].type) == 2) {
+      /* AND(val, 0x8000) gives the sign bit.
+       *
+       * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
+       */
+      fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
+      bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
+
+      op[0].type = BRW_REGISTER_TYPE_UW;
+      result.type = BRW_REGISTER_TYPE_UW;
+      bld.AND(result, op[0], brw_imm_uw(0x8000u));
+
+      if (instr->op == nir_op_fsign)
+         inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
+      else {
+         /* Use XOR here to get the result sign correct. */
+         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
+      }
+
+      inst->predicate = BRW_PREDICATE_NORMAL;
+   } else if (type_sz(op[0].type) == 4) {
+      /* AND(val, 0x80000000) gives the sign bit.
+       *
+       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
+       * zero.
+       */
+      bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+
+      op[0].type = BRW_REGISTER_TYPE_UD;
+      result.type = BRW_REGISTER_TYPE_UD;
+      bld.AND(result, op[0], brw_imm_ud(0x80000000u));
+
+      if (instr->op == nir_op_fsign)
+         inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
+      else {
+         /* Use XOR here to get the result sign correct. */
+         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
+      }
+
+      inst->predicate = BRW_PREDICATE_NORMAL;
+   } else {
+      /* For doubles we do the same but we need to consider:
+       *
+       * - 2-src instructions can't operate with 64-bit immediates
+       * - The sign is encoded in the high 32-bit of each DF
+       * - We need to produce a DF result.
+       */
+
+      fs_reg zero = s.vgrf(glsl_double_type());
+      bld.MOV(zero, setup_imm_df(bld, 0.0));
+      bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
+
+      bld.MOV(result, zero);
+
+      fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
+      bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
+              brw_imm_ud(0x80000000u));
+
+      if (instr->op == nir_op_fsign) {
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
+      } else {
+         if (devinfo->has_64bit_int) {
+            /* This could be done better in some cases.  If the scale is an
+             * immediate with the low 32-bits all 0, emitting a separate XOR and
+             * OR would allow an algebraic optimization to remove the OR.  There
+             * are currently zero instances of fsign(double(x))*IMM in shader-db
+             * or any test suite, so it is hard to care at this time.
+             */
+            fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
+            inst = bld.XOR(result_int64, result_int64,
+                           retype(op[1], BRW_REGISTER_TYPE_UQ));
+         } else {
+            /* No 64-bit integer support: XOR only the high dword, copy
+             * the low dword through.
+             */
+            fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
+            bld.MOV(subscript(result_int64, BRW_REGISTER_TYPE_UD, 0),
+                    subscript(op[1], BRW_REGISTER_TYPE_UD, 0));
+            bld.XOR(subscript(result_int64, BRW_REGISTER_TYPE_UD, 1),
+                    subscript(result_int64, BRW_REGISTER_TYPE_UD, 1),
+                    subscript(op[1], BRW_REGISTER_TYPE_UD, 1));
+         }
+      }
+   }
+}
+
+/**
+ * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
+ *
+ * Checks the operands of a \c nir_op_fmul to determine whether or not
+ * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
+ *
+ * \param instr  The multiplication instruction
+ *
+ * \param fsign_src  The source of \c instr that may or may not be a
+ *                   \c nir_op_fsign
+ */
+static bool
+can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
+{
+   assert(instr->op == nir_op_fmul);
+
+   nir_alu_instr *const fsign_instr =
+      nir_src_as_alu_instr(instr->src[fsign_src].src);
+
+   /* Rules:
+    *
+    * 1. instr->src[fsign_src] must be a nir_op_fsign.
+    * 2.
The nir_op_fsign can only be used by this multiplication. + * 3. The source that is the nir_op_fsign does not have source modifiers. + * \c emit_fsign only examines the source modifiers of the source of the + * \c nir_op_fsign. + * + * The nir_op_fsign must also not have the saturate modifier, but steps + * have already been taken (in nir_opt_algebraic) to ensure that. + */ + return fsign_instr != NULL && fsign_instr->op == nir_op_fsign && + is_used_once(fsign_instr); +} + +static bool +is_const_zero(const nir_src &src) +{ + return nir_src_is_const(src) && nir_src_as_int(src) == 0; +} + +static void +fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, + bool need_dest) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + fs_inst *inst; + unsigned execution_mode = + bld.shader->nir->info.float_controls_execution_mode; + + fs_reg op[NIR_MAX_VEC_COMPONENTS]; + fs_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest); + +#ifndef NDEBUG + /* Everything except raw moves, some type conversions, iabs, and ineg + * should have 8-bit sources lowered by nir_lower_bit_size in + * brw_preprocess_nir or by brw_nir_lower_conversions in + * brw_postprocess_nir. 
+ */ + switch (instr->op) { + case nir_op_mov: + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: + case nir_op_i2f16: + case nir_op_i2f32: + case nir_op_i2i16: + case nir_op_i2i32: + case nir_op_u2f16: + case nir_op_u2f32: + case nir_op_u2u16: + case nir_op_u2u32: + case nir_op_iabs: + case nir_op_ineg: + case nir_op_pack_32_4x8_split: + break; + + default: + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + assert(type_sz(op[i].type) > 1); + } + } +#endif + + switch (instr->op) { + case nir_op_mov: + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: { + fs_reg temp = result; + bool need_extra_copy = false; + + nir_intrinsic_instr *store_reg = + nir_store_reg_for_def(&instr->def); + if (store_reg != NULL) { + nir_def *dest_reg = store_reg->src[1].ssa; + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + nir_intrinsic_instr *load_reg = + nir_load_reg_for_def(instr->src[i].src.ssa); + if (load_reg == NULL) + continue; + + if (load_reg->src[0].ssa == dest_reg) { + need_extra_copy = true; + temp = bld.vgrf(result.type, 4); + break; + } + } + } + + nir_component_mask_t write_mask = get_nir_write_mask(instr->def); + unsigned last_bit = util_last_bit(write_mask); + + for (unsigned i = 0; i < last_bit; i++) { + if (!(write_mask & (1 << i))) + continue; + + if (instr->op == nir_op_mov) { + bld.MOV(offset(temp, bld, i), + offset(op[0], bld, instr->src[0].swizzle[i])); + } else { + bld.MOV(offset(temp, bld, i), + offset(op[i], bld, instr->src[i].swizzle[0])); + } + } + + /* In this case the source and destination registers were the same, + * so we need to insert an extra set of moves in order to deal with + * any swizzling. 
+ */ + if (need_extra_copy) { + for (unsigned i = 0; i < last_bit; i++) { + if (!(write_mask & (1 << i))) + continue; + + bld.MOV(offset(result, bld, i), offset(temp, bld, i)); + } + } + return; + } + + case nir_op_i2f32: + case nir_op_u2f32: + if (optimize_extract_to_float(ntb, instr, result)) + return; + inst = bld.MOV(result, op[0]); + break; + + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + case nir_op_f2f16: { + brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED; + + if (nir_op_f2f16 == instr->op) + rnd = brw_rnd_mode_from_execution_mode(execution_mode); + else + rnd = brw_rnd_mode_from_nir_op(instr->op); + + if (BRW_RND_MODE_UNSPECIFIED != rnd) + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd)); + + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + inst = bld.F32TO16(result, op[0]); + break; + } + + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: + case nir_op_b2f16: + case nir_op_b2f32: + case nir_op_b2f64: + if (try_emit_b2fi_of_inot(ntb, bld, result, instr)) + break; + op[0].type = BRW_REGISTER_TYPE_D; + op[0].negate = !op[0].negate; + FALLTHROUGH; + case nir_op_i2f64: + case nir_op_i2i64: + case nir_op_u2f64: + case nir_op_u2u64: + case nir_op_f2f64: + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_i2i32: + case nir_op_u2u32: + case nir_op_f2i32: + case nir_op_f2u32: + case nir_op_i2f16: + case nir_op_u2f16: + case nir_op_f2i16: + case nir_op_f2u16: + case nir_op_f2i8: + case nir_op_f2u8: + if (result.type == BRW_REGISTER_TYPE_B || + result.type == BRW_REGISTER_TYPE_UB || + result.type == BRW_REGISTER_TYPE_HF) + assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + + if (op[0].type == BRW_REGISTER_TYPE_B || + op[0].type == BRW_REGISTER_TYPE_UB || + op[0].type == BRW_REGISTER_TYPE_HF) + assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ + + inst = bld.MOV(result, op[0]); + break; + + case nir_op_i2i8: + case nir_op_u2u8: + 
assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ + FALLTHROUGH; + case nir_op_i2i16: + case nir_op_u2u16: { + /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns. + * Emitting the instructions one by one results in two MOV instructions + * that won't be propagated. By handling both instructions here, a + * single MOV is emitted. + */ + nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src); + if (extract_instr != NULL) { + if (extract_instr->op == nir_op_extract_u8 || + extract_instr->op == nir_op_extract_i8) { + prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false); + + const unsigned byte = nir_src_as_uint(extract_instr->src[1].src); + const brw_reg_type type = + brw_int_type(1, extract_instr->op == nir_op_extract_i8); + + op[0] = subscript(op[0], type, byte); + } else if (extract_instr->op == nir_op_extract_u16 || + extract_instr->op == nir_op_extract_i16) { + prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false); + + const unsigned word = nir_src_as_uint(extract_instr->src[1].src); + const brw_reg_type type = + brw_int_type(2, extract_instr->op == nir_op_extract_i16); + + op[0] = subscript(op[0], type, word); + } + } + + inst = bld.MOV(result, op[0]); + break; + } + + case nir_op_fsat: + inst = bld.MOV(result, op[0]); + inst->saturate = true; + break; + + case nir_op_fneg: + case nir_op_ineg: + op[0].negate = true; + inst = bld.MOV(result, op[0]); + break; + + case nir_op_fabs: + case nir_op_iabs: + op[0].negate = false; + op[0].abs = true; + inst = bld.MOV(result, op[0]); + break; + + case nir_op_f2f32: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + if (op[0].type == BRW_REGISTER_TYPE_HF) + assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ + + inst = bld.MOV(result, op[0]); + break; + + 
case nir_op_fsign: + emit_fsign(ntb, bld, instr, result, op, 0); + break; + + case nir_op_frcp: + inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); + break; + + case nir_op_fexp2: + inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]); + break; + + case nir_op_flog2: + inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]); + break; + + case nir_op_fsin: + inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); + break; + + case nir_op_fcos: + inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); + break; + + case nir_op_fddx_fine: + inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); + break; + case nir_op_fddx: + case nir_op_fddx_coarse: + inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); + break; + case nir_op_fddy_fine: + inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); + break; + case nir_op_fddy: + case nir_op_fddy_coarse: + inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); + break; + + case nir_op_fadd: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + FALLTHROUGH; + case nir_op_iadd: + inst = bld.ADD(result, op[0], op[1]); + break; + + case nir_op_iadd3: + inst = bld.ADD3(result, op[0], op[1], op[2]); + break; + + case nir_op_iadd_sat: + case nir_op_uadd_sat: + inst = bld.ADD(result, op[0], op[1]); + inst->saturate = true; + break; + + case nir_op_isub_sat: + bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]); + break; + + case nir_op_usub_sat: + bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]); + break; + + case nir_op_irhadd: + case nir_op_urhadd: + assert(instr->def.bit_size < 64); + inst = bld.AVG(result, op[0], op[1]); + break; + + case nir_op_ihadd: + case nir_op_uhadd: { + assert(instr->def.bit_size < 64); + fs_reg tmp = bld.vgrf(result.type); + + if (devinfo->ver >= 8) { + op[0] = resolve_source_modifiers(bld, op[0]); + op[1] = resolve_source_modifiers(bld, op[1]); + } + + 
/* AVG(x, y) - ((x ^ y) & 1) */ + bld.XOR(tmp, op[0], op[1]); + bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type)); + bld.AVG(result, op[0], op[1]); + inst = bld.ADD(result, result, tmp); + inst->src[1].negate = true; + break; + } + + case nir_op_fmul: + for (unsigned i = 0; i < 2; i++) { + if (can_fuse_fmul_fsign(instr, i)) { + emit_fsign(ntb, bld, instr, result, op, i); + return; + } + } + + /* We emit the rounding mode after the previous fsign optimization since + * it won't result in a MUL, but will try to negate the value by other + * means. + */ + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + inst = bld.MUL(result, op[0], op[1]); + break; + + case nir_op_imul_2x32_64: + case nir_op_umul_2x32_64: + bld.MUL(result, op[0], op[1]); + break; + + case nir_op_imul_32x16: + case nir_op_umul_32x16: { + const bool ud = instr->op == nir_op_umul_32x16; + const enum brw_reg_type word_type = + ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W; + const enum brw_reg_type dword_type = + ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + assert(instr->def.bit_size == 32); + + /* Before copy propagation there are no immediate values. 
*/ + assert(op[0].file != IMM && op[1].file != IMM); + + op[1] = subscript(op[1], word_type, 0); + + if (devinfo->ver >= 7) + bld.MUL(result, retype(op[0], dword_type), op[1]); + else + bld.MUL(result, op[1], retype(op[0], dword_type)); + + break; + } + + case nir_op_imul: + assert(instr->def.bit_size < 64); + bld.MUL(result, op[0], op[1]); + break; + + case nir_op_imul_high: + case nir_op_umul_high: + assert(instr->def.bit_size < 64); + if (instr->def.bit_size == 32) { + bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]); + } else { + fs_reg tmp = bld.vgrf(brw_reg_type_from_bit_size(32, op[0].type)); + bld.MUL(tmp, op[0], op[1]); + bld.MOV(result, subscript(tmp, result.type, 1)); + } + break; + + case nir_op_idiv: + case nir_op_udiv: + assert(instr->def.bit_size < 64); + bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]); + break; + + case nir_op_uadd_carry: + unreachable("Should have been lowered by carry_to_arith()."); + + case nir_op_usub_borrow: + unreachable("Should have been lowered by borrow_to_arith()."); + + case nir_op_umod: + case nir_op_irem: + /* According to the sign table for INT DIV in the Ivy Bridge PRM, it + * appears that our hardware just does the right thing for signed + * remainder. + */ + assert(instr->def.bit_size < 64); + bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); + break; + + case nir_op_imod: { + /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ + bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); + + /* Math instructions don't support conditional mod */ + inst = bld.MOV(bld.null_reg_d(), result); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + /* Now, we need to determine if signs of the sources are different. + * When we XOR the sources, the top bit is 0 if they are the same and 1 + * if they are different. We can then use a conditional modifier to + * turn that into a predicate. This leads us to an XOR.l instruction. 
+ * + * Technically, according to the PRM, you're not allowed to use .l on a + * XOR instruction. However, empirical experiments and Curro's reading + * of the simulator source both indicate that it's safe. + */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D); + inst = bld.XOR(tmp, op[0], op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_L; + + /* If the result of the initial remainder operation is non-zero and the + * two sources have different signs, add in a copy of op[1] to get the + * final integer modulus value. + */ + inst = bld.ADD(result, result, op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + + case nir_op_flt32: + case nir_op_fge32: + case nir_op_feq32: + case nir_op_fneu32: { + fs_reg dest = result; + + const uint32_t bit_size = nir_src_bit_size(instr->src[0].src); + if (bit_size != 32) { + dest = bld.vgrf(op[0].type, 1); + bld.UNDEF(dest); + } + + bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op)); + + if (bit_size > 32) { + bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); + } else if(bit_size < 32) { + /* When we convert the result to 32-bit we need to be careful and do + * it as a signed conversion to get sign extension (for 32-bit true) + */ + const brw_reg_type src_type = + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D); + + bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type)); + } + break; + } + + case nir_op_ilt32: + case nir_op_ult32: + case nir_op_ige32: + case nir_op_uge32: + case nir_op_ieq32: + case nir_op_ine32: { + fs_reg dest = result; + + const uint32_t bit_size = type_sz(op[0].type) * 8; + if (bit_size != 32) { + dest = bld.vgrf(op[0].type, 1); + bld.UNDEF(dest); + } + + bld.CMP(dest, op[0], op[1], + brw_cmod_for_nir_comparison(instr->op)); + + if (bit_size > 32) { + bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); + } else if (bit_size < 32) { + /* When we convert the result to 32-bit we need to be careful and do + * 
it as a signed conversion to get sign extension (for 32-bit true) + */ + const brw_reg_type src_type = + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D); + + bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type)); + } + break; + } + + case nir_op_inot: + if (devinfo->ver >= 8) { + nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src); + + if (inot_src_instr != NULL && + (inot_src_instr->op == nir_op_ior || + inot_src_instr->op == nir_op_ixor || + inot_src_instr->op == nir_op_iand)) { + /* The sources of the source logical instruction are now the + * sources of the instruction that will be generated. + */ + prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false); + resolve_inot_sources(ntb, bld, inot_src_instr, op); + + /* Smash all of the sources and destination to be signed. This + * doesn't matter for the operation of the instruction, but cmod + * propagation fails on unsigned sources with negation (due to + * fs_inst::can_do_cmod returning false). + */ + result.type = + brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_type_int | + instr->def.bit_size)); + op[0].type = + brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_type_int | + nir_src_bit_size(inot_src_instr->src[0].src))); + op[1].type = + brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_type_int | + nir_src_bit_size(inot_src_instr->src[1].src))); + + /* For XOR, only invert one of the sources. Arbitrarily choose + * the first source. 
+ */ + op[0].negate = !op[0].negate; + if (inot_src_instr->op != nir_op_ixor) + op[1].negate = !op[1].negate; + + switch (inot_src_instr->op) { + case nir_op_ior: + bld.AND(result, op[0], op[1]); + return; + + case nir_op_iand: + bld.OR(result, op[0], op[1]); + return; + + case nir_op_ixor: + bld.XOR(result, op[0], op[1]); + return; + + default: + unreachable("impossible opcode"); + } + } + op[0] = resolve_source_modifiers(bld, op[0]); + } + bld.NOT(result, op[0]); + break; + case nir_op_ixor: + if (devinfo->ver >= 8) { + resolve_inot_sources(ntb, bld, instr, op); + } + bld.XOR(result, op[0], op[1]); + break; + case nir_op_ior: + if (devinfo->ver >= 8) { + resolve_inot_sources(ntb, bld, instr, op); + } + bld.OR(result, op[0], op[1]); + break; + case nir_op_iand: + if (devinfo->ver >= 8) { + resolve_inot_sources(ntb, bld, instr, op); + } + bld.AND(result, op[0], op[1]); + break; + + case nir_op_fdot2: + case nir_op_fdot3: + case nir_op_fdot4: + case nir_op_b32all_fequal2: + case nir_op_b32all_iequal2: + case nir_op_b32all_fequal3: + case nir_op_b32all_iequal3: + case nir_op_b32all_fequal4: + case nir_op_b32all_iequal4: + case nir_op_b32any_fnequal2: + case nir_op_b32any_inequal2: + case nir_op_b32any_fnequal3: + case nir_op_b32any_inequal3: + case nir_op_b32any_fnequal4: + case nir_op_b32any_inequal4: + unreachable("Lowered by nir_lower_alu_reductions"); + + case nir_op_ldexp: + unreachable("not reached: should be handled by ldexp_to_arith()"); + + case nir_op_fsqrt: + inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]); + break; + + case nir_op_frsq: + inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]); + break; + + case nir_op_ftrunc: + inst = bld.RNDZ(result, op[0]); + if (devinfo->ver < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } + break; + + case nir_op_fceil: { + op[0].negate = !op[0].negate; + fs_reg temp = 
s.vgrf(glsl_float_type()); + bld.RNDD(temp, op[0]); + temp.negate = true; + inst = bld.MOV(result, temp); + break; + } + case nir_op_ffloor: + inst = bld.RNDD(result, op[0]); + break; + case nir_op_ffract: + inst = bld.FRC(result, op[0]); + break; + case nir_op_fround_even: + inst = bld.RNDE(result, op[0]); + if (devinfo->ver < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } + break; + + case nir_op_fquantize2f16: { + fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D); + fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F); + fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F); + + /* The destination stride must be at least as big as the source stride. */ + tmp16 = subscript(tmp16, BRW_REGISTER_TYPE_HF, 0); + + /* Check for denormal */ + fs_reg abs_src0 = op[0]; + abs_src0.abs = true; + bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), + BRW_CONDITIONAL_L); + /* Get the appropriately signed zero */ + bld.AND(retype(zero, BRW_REGISTER_TYPE_UD), + retype(op[0], BRW_REGISTER_TYPE_UD), + brw_imm_ud(0x80000000)); + /* Do the actual F32 -> F16 -> F32 conversion */ + bld.F32TO16(tmp16, op[0]); + bld.F16TO32(tmp32, tmp16); + /* Select that or zero based on normal status */ + inst = bld.SEL(result, zero, tmp32); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + + case nir_op_imin: + case nir_op_umin: + case nir_op_fmin: + inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L); + break; + + case nir_op_imax: + case nir_op_umax: + case nir_op_fmax: + inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE); + break; + + case nir_op_pack_snorm_2x16: + case nir_op_pack_snorm_4x8: + case nir_op_pack_unorm_2x16: + case nir_op_pack_unorm_4x8: + case nir_op_unpack_snorm_2x16: + case nir_op_unpack_snorm_4x8: + case nir_op_unpack_unorm_2x16: + case nir_op_unpack_unorm_4x8: + case nir_op_unpack_half_2x16: + case 
nir_op_pack_half_2x16: + unreachable("not reached: should be handled by lower_packing_builtins"); + + case nir_op_unpack_half_2x16_split_x_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + FALLTHROUGH; + case nir_op_unpack_half_2x16_split_x: + inst = bld.F16TO32(result, subscript(op[0], BRW_REGISTER_TYPE_HF, 0)); + break; + + case nir_op_unpack_half_2x16_split_y_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + FALLTHROUGH; + case nir_op_unpack_half_2x16_split_y: + inst = bld.F16TO32(result, subscript(op[0], BRW_REGISTER_TYPE_HF, 1)); + break; + + case nir_op_pack_64_2x32_split: + case nir_op_pack_32_2x16_split: + bld.emit(FS_OPCODE_PACK, result, op[0], op[1]); + break; + + case nir_op_pack_32_4x8_split: + bld.emit(FS_OPCODE_PACK, result, op, 4); + break; + + case nir_op_unpack_64_2x32_split_x: + case nir_op_unpack_64_2x32_split_y: { + if (instr->op == nir_op_unpack_64_2x32_split_x) + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0)); + else + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1)); + break; + } + + case nir_op_unpack_32_2x16_split_x: + case nir_op_unpack_32_2x16_split_y: { + if (instr->op == nir_op_unpack_32_2x16_split_x) + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); + else + bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); + break; + } + + case nir_op_fpow: + inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]); + break; + + case nir_op_bitfield_reverse: + assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) == 32); + bld.BFREV(result, op[0]); + break; + + case nir_op_bit_count: + assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) < 64); + bld.CBIT(result, op[0]); + break; + + case nir_op_uclz: + assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) == 32); + bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]); + break; + + case nir_op_ifind_msb: { + 
assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) == 32); + assert(devinfo->ver >= 7); + + bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]); + + /* FBH counts from the MSB side, while GLSL's findMSB() wants the count + * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then + * subtract the result from 31 to convert the MSB count into an LSB + * count. + */ + bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ); + + inst = bld.ADD(result, result, brw_imm_d(31)); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->src[0].negate = true; + break; + } + + case nir_op_find_lsb: + assert(instr->def.bit_size == 32); + assert(nir_src_bit_size(instr->src[0].src) == 32); + assert(devinfo->ver >= 7); + bld.FBL(result, op[0]); + break; + + case nir_op_ubitfield_extract: + case nir_op_ibitfield_extract: + unreachable("should have been lowered"); + case nir_op_ubfe: + case nir_op_ibfe: + assert(instr->def.bit_size < 64); + bld.BFE(result, op[2], op[1], op[0]); + break; + case nir_op_bfm: + assert(instr->def.bit_size < 64); + bld.BFI1(result, op[0], op[1]); + break; + case nir_op_bfi: + assert(instr->def.bit_size < 64); + + /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is + * either 0 or src0. Replacing the 0 with another value can eliminate a + * temporary register. + */ + if (is_const_zero(instr->src[2].src)) + bld.BFI2(result, op[0], op[1], op[0]); + else + bld.BFI2(result, op[0], op[1], op[2]); + + break; + + case nir_op_bitfield_insert: + unreachable("not reached: should have been lowered"); + + /* With regards to implicit masking of the shift counts for 8- and 16-bit + * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only + * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW + * src0) are used. The Bspec (backed by data from experimentation) state + * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other + * types. 
+ * + * The match the behavior expected for the NIR opcodes, explicit masks for + * 8- and 16-bit types must be added. + */ + case nir_op_ishl: + if (instr->def.bit_size < 32) { + bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1)); + bld.SHL(result, op[0], result); + } else { + bld.SHL(result, op[0], op[1]); + } + + break; + case nir_op_ishr: + if (instr->def.bit_size < 32) { + bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1)); + bld.ASR(result, op[0], result); + } else { + bld.ASR(result, op[0], op[1]); + } + + break; + case nir_op_ushr: + if (instr->def.bit_size < 32) { + bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1)); + bld.SHR(result, op[0], result); + } else { + bld.SHR(result, op[0], op[1]); + } + + break; + + case nir_op_urol: + bld.ROL(result, op[0], op[1]); + break; + case nir_op_uror: + bld.ROR(result, op[0], op[1]); + break; + + case nir_op_pack_half_2x16_split: + bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); + break; + + case nir_op_sdot_4x8_iadd: + case nir_op_sdot_4x8_iadd_sat: + inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D), + retype(op[2], BRW_REGISTER_TYPE_D), + retype(op[0], BRW_REGISTER_TYPE_D), + retype(op[1], BRW_REGISTER_TYPE_D)); + + if (instr->op == nir_op_sdot_4x8_iadd_sat) + inst->saturate = true; + break; + + case nir_op_udot_4x8_uadd: + case nir_op_udot_4x8_uadd_sat: + inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_UD), + retype(op[2], BRW_REGISTER_TYPE_UD), + retype(op[0], BRW_REGISTER_TYPE_UD), + retype(op[1], BRW_REGISTER_TYPE_UD)); + + if (instr->op == nir_op_udot_4x8_uadd_sat) + inst->saturate = true; + break; + + case nir_op_sudot_4x8_iadd: + case nir_op_sudot_4x8_iadd_sat: + inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D), + retype(op[2], BRW_REGISTER_TYPE_D), + retype(op[0], BRW_REGISTER_TYPE_D), + retype(op[1], BRW_REGISTER_TYPE_UD)); + + if (instr->op == nir_op_sudot_4x8_iadd_sat) + inst->saturate = true; + break; + + case nir_op_ffma: + if 
(nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + inst = bld.MAD(result, op[2], op[1], op[0]); + break; + + case nir_op_flrp: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + inst = bld.LRP(result, op[0], op[1], op[2]); + break; + + case nir_op_b32csel: + if (optimize_frontfacing_ternary(ntb, instr, result)) + return; + + bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); + inst = bld.SEL(result, op[1], op[2]); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + + case nir_op_extract_u8: + case nir_op_extract_i8: { + unsigned byte = nir_src_as_uint(instr->src[1].src); + + /* The PRMs say: + * + * BDW+ + * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB. + * Use two instructions and a word or DWord intermediate integer type. + */ + if (instr->def.bit_size == 64) { + const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); + + if (instr->op == nir_op_extract_i8) { + /* If we need to sign extend, extract to a word first */ + fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W); + bld.MOV(w_temp, subscript(op[0], type, byte)); + bld.MOV(result, w_temp); + } else if (byte & 1) { + /* Extract the high byte from the word containing the desired byte + * offset. 
+ */ + bld.SHR(result, + subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), + brw_imm_uw(8)); + } else { + /* Otherwise use an AND with 0xff and a word type */ + bld.AND(result, + subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), + brw_imm_uw(0xff)); + } + } else { + const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); + bld.MOV(result, subscript(op[0], type, byte)); + } + break; + } + + case nir_op_extract_u16: + case nir_op_extract_i16: { + const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16); + unsigned word = nir_src_as_uint(instr->src[1].src); + bld.MOV(result, subscript(op[0], type, word)); + break; + } + + default: + unreachable("unhandled instruction"); + } + + /* If we need to do a boolean resolve, replace the result with -(x & 1) + * to sign extend the low bit to 0/~0 + */ + if (devinfo->ver <= 5 && + !result.is_null() && + (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { + fs_reg masked = s.vgrf(glsl_int_type()); + bld.AND(masked, result, brw_imm_d(1)); + masked.negate = true; + bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked); + } +} + +static void +fs_nir_emit_load_const(nir_to_brw_state &ntb, + nir_load_const_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + + const brw_reg_type reg_type = + brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D); + fs_reg reg = bld.vgrf(reg_type, instr->def.num_components); + + switch (instr->def.bit_size) { + case 8: + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8)); + break; + + case 16: + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16)); + break; + + case 32: + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32)); + break; + + case 64: + 
assert(devinfo->ver >= 7); + if (!devinfo->has_64bit_int) { + for (unsigned i = 0; i < instr->def.num_components; i++) { + bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF), + setup_imm_df(bld, instr->value[i].f64)); + } + } else { + for (unsigned i = 0; i < instr->def.num_components; i++) + bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64)); + } + break; + + default: + unreachable("Invalid bit size"); + } + + ntb.ssa_values[instr->def.index] = reg; +} + +static bool +get_nir_src_bindless(nir_to_brw_state &ntb, const nir_src &src) +{ + return ntb.ssa_bind_infos[src.ssa->index].bindless; +} + +static bool +is_resource_src(nir_src src) +{ + return src.ssa->parent_instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(src.ssa->parent_instr)->intrinsic == nir_intrinsic_resource_intel; +} + +static fs_reg +get_resource_nir_src(nir_to_brw_state &ntb, const nir_src &src) +{ + if (!is_resource_src(src)) + return fs_reg(); + return ntb.resource_values[src.ssa->index]; +} + +static fs_reg +get_nir_src(nir_to_brw_state &ntb, const nir_src &src) +{ + const intel_device_info *devinfo = ntb.devinfo; + + nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa); + + fs_reg reg; + if (!load_reg) { + if (nir_src_is_undef(src)) { + const brw_reg_type reg_type = + brw_reg_type_from_bit_size(src.ssa->bit_size, + BRW_REGISTER_TYPE_D); + reg = ntb.bld.vgrf(reg_type, src.ssa->num_components); + } else { + reg = ntb.ssa_values[src.ssa->index]; + } + } else { + nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa); + /* We don't handle indirects on locals */ + assert(nir_intrinsic_base(load_reg) == 0); + assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect); + reg = ntb.ssa_values[decl_reg->def.index]; + } + + if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) { + /* The only 64-bit type available on gfx7 is DF, so use that. 
*/ + reg.type = BRW_REGISTER_TYPE_DF; + } else { + /* To avoid floating-point denorm flushing problems, set the type by + * default to an integer type - instructions that need floating point + * semantics will set this to F if they need to + */ + reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src), + BRW_REGISTER_TYPE_D); + } + + return reg; +} + +/** + * Return an IMM for constants; otherwise call get_nir_src() as normal. + * + * This function should not be called on any value which may be 64 bits. + * We could theoretically support 64-bit on gfx8+ but we choose not to + * because it wouldn't work in general (no gfx7 support) and there are + * enough restrictions in 64-bit immediates that you can't take the return + * value and treat it the same as the result of get_nir_src(). + */ +static fs_reg +get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src) +{ + assert(nir_src_bit_size(src) == 32); + return nir_src_is_const(src) ? + fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src); +} + +static fs_reg +get_nir_def(nir_to_brw_state &ntb, const nir_def &def) +{ + const fs_builder &bld = ntb.bld; + + nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def); + if (!store_reg) { + const brw_reg_type reg_type = + brw_reg_type_from_bit_size(def.bit_size, + def.bit_size == 8 ? 
+ BRW_REGISTER_TYPE_D : + BRW_REGISTER_TYPE_F); + ntb.ssa_values[def.index] = + bld.vgrf(reg_type, def.num_components); + bld.UNDEF(ntb.ssa_values[def.index]); + return ntb.ssa_values[def.index]; + } else { + nir_intrinsic_instr *decl_reg = + nir_reg_get_decl(store_reg->src[1].ssa); + /* We don't handle indirects on locals */ + assert(nir_intrinsic_base(store_reg) == 0); + assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect); + return ntb.ssa_values[decl_reg->def.index]; + } +} + +static nir_component_mask_t +get_nir_write_mask(const nir_def &def) +{ + nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def); + if (!store_reg) { + return nir_component_mask(def.num_components); + } else { + return nir_intrinsic_write_mask(store_reg); + } +} + +static fs_inst * +emit_pixel_interpolater_send(const fs_builder &bld, + enum opcode opcode, + const fs_reg &dst, + const fs_reg &src, + const fs_reg &desc, + const fs_reg &flag_reg, + glsl_interp_mode interpolation) +{ + struct brw_wm_prog_data *wm_prog_data = + brw_wm_prog_data(bld.shader->stage_prog_data); + + fs_reg srcs[INTERP_NUM_SRCS]; + srcs[INTERP_SRC_OFFSET] = src; + srcs[INTERP_SRC_MSG_DESC] = desc; + srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg; + + fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS); + /* 2 floats per slot returned */ + inst->size_written = 2 * dst.component_size(inst->exec_size); + if (interpolation == INTERP_MODE_NOPERSPECTIVE) { + inst->pi_noperspective = true; + /* TGL BSpec says: + * This field cannot be set to "Linear Interpolation" + * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled" + */ + wm_prog_data->uses_nonperspective_interp_modes = true; + } + + wm_prog_data->pulls_bary = true; + + return inst; +} + +/** + * Computes 1 << x, given a D/UD register containing some value x. 
+ */ +static fs_reg +intexp2(const fs_builder &bld, const fs_reg &x) +{ + assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); + + fs_reg result = bld.vgrf(x.type, 1); + fs_reg one = bld.vgrf(x.type, 1); + + bld.MOV(one, retype(brw_imm_d(1), one.type)); + bld.SHL(result, one, x); + return result; +} + +static void +emit_gs_end_primitive(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src) +{ + fs_visitor &s = ntb.s; + assert(s.stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data); + + if (s.gs_compile->control_data_header_size_bits == 0) + return; + + /* We can only do EndPrimitive() functionality when the control data + * consists of cut bits. Fortunately, the only time it isn't is when the + * output type is points, in which case EndPrimitive() is a no-op. + */ + if (gs_prog_data->control_data_format != + GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { + return; + } + + /* Cut bits use one bit per vertex. */ + assert(s.gs_compile->control_data_bits_per_vertex == 1); + + fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting + * vertex n, 0 otherwise. So all we need to do here is mark bit + * (vertex_count - 1) % 32 in the cut_bits register to indicate that + * EndPrimitive() was called after emitting vertex (vertex_count - 1); + * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. + * + * Note that if EndPrimitive() is called before emitting any vertices, this + * will cause us to set bit 31 of the control_data_bits register to 1. + * That's fine because: + * + * - If max_vertices < 32, then vertex number 31 (zero-based) will never be + * output, so the hardware will ignore cut bit 31. 
+ * + * - If max_vertices == 32, then vertex number 31 is guaranteed to be the + * last vertex, so setting cut bit 31 has no effect (since the primitive + * is automatically ended when the GS terminates). + * + * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the + * control_data_bits register to 0 when the first vertex is emitted. + */ + + const fs_builder abld = ntb.bld.annotate("end primitive"); + + /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ + fs_reg prev_count = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); + fs_reg mask = intexp2(abld, prev_count); + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, 1 << (vertex_count - 1) is equivalent to 1 << + * ((vertex_count - 1) % 32). + */ + abld.OR(s.control_data_bits, s.control_data_bits, mask); +} + +void +fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) +{ + assert(stage == MESA_SHADER_GEOMETRY); + assert(gs_compile->control_data_bits_per_vertex != 0); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); + + const fs_builder bld = fs_builder(this).at_end(); + const fs_builder abld = bld.annotate("emit control data bits"); + const fs_builder fwa_bld = bld.exec_all(); + + /* We use a single UD register to accumulate control data bits (32 bits + * for each of the SIMD8 channels). So we need to write a DWord (32 bits) + * at a time. + * + * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. + * We have select a 128-bit group via the Global and Per-Slot Offsets, then + * use the Channel Mask phase to enable/disable which DWord within that + * group to write. (Remember, different SIMD8 channels may have emitted + * different numbers of vertices, so we may need per-slot offsets.) 
 + * + * Channel masking presents an annoying problem: we may have to replicate + * the data up to 4 times: + * + * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. + * + * To avoid penalizing shaders that emit a small number of vertices, we + * can avoid these sometimes: if the size of the control data header is + * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will + * land in the same 128-bit group, so we can skip per-slot offsets. + * + * Similarly, if the control data header is <= 32 bits, there is only one + * DWord, so we can skip channel masks. + */ + fs_reg channel_mask, per_slot_offset; + + if (gs_compile->control_data_header_size_bits > 32) + channel_mask = vgrf(glsl_uint_type()); + + if (gs_compile->control_data_header_size_bits > 128) + per_slot_offset = vgrf(glsl_uint_type()); + + /* Figure out which DWord we're trying to write to using the formula: + * + * dword_index = (vertex_count - 1) * bits_per_vertex / 32 + * + * Since bits_per_vertex is a power of two, and is known at compile + * time, this can be optimized to: + * + * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) + */ + if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) { + fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); + unsigned log2_bits_per_vertex = + util_last_bit(gs_compile->control_data_bits_per_vertex); + abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex)); + + if (per_slot_offset.file != BAD_FILE) { + /* Set the per-slot offset to dword_index / 4, so that we'll write to + * the appropriate OWord within the control data header. + */ + abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u)); + } + + /* Set the channel masks to 1 << (dword_index % 4), so that we'll + * write to the appropriate DWORD within the OWORD. 
+ */ + fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fwa_bld.AND(channel, dword_index, brw_imm_ud(3u)); + channel_mask = intexp2(fwa_bld, channel); + /* Then the channel masks need to be in bits 23:16. */ + fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u)); + } + + /* If there are channel masks, add 3 extra copies of the data. */ + const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE); + fs_reg sources[4]; + + for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) + sources[i] = this->control_data_bits; + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask; + srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_F, length); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); + + fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + + /* We need to increment Global Offset by 256-bits to make room for + * Broadwell's extra "Vertex Count" payload at the beginning of the + * URB entry. Since this is an OWord message, Global Offset is counted + * in 128-bit units, so we must set it to 2. + */ + if (gs_prog_data->static_vertex_count == -1) + inst->offset = 2; +} + +static void +set_gs_stream_control_data_bits(nir_to_brw_state &ntb, const fs_reg &vertex_count, + unsigned stream_id) +{ + fs_visitor &s = ntb.s; + + /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ + + /* Note: we are calling this *before* increasing vertex_count, so + * this->vertex_count == vertex_count - 1 in the formula above. 
+ */ + + /* Stream mode uses 2 bits per vertex */ + assert(s.gs_compile->control_data_bits_per_vertex == 2); + + /* Must be a valid stream */ + assert(stream_id < 4); /* MAX_VERTEX_STREAMS */ + + /* Control data bits are initialized to 0 so we don't have to set any + * bits when sending vertices to stream 0. + */ + if (stream_id == 0) + return; + + const fs_builder abld = ntb.bld.annotate("set stream control data bits", NULL); + + /* reg::sid = stream_id */ + fs_reg sid = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.MOV(sid, brw_imm_ud(stream_id)); + + /* reg:shift_count = 2 * (vertex_count - 1) */ + fs_reg shift_count = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); + + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to + * stream_id << ((2 * (vertex_count - 1)) % 32). + */ + fs_reg mask = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(mask, sid, shift_count); + abld.OR(s.control_data_bits, s.control_data_bits, mask); +} + +static void +emit_gs_vertex(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src, + unsigned stream_id) +{ + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data); + + fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Haswell and later hardware ignores the "Render Stream Select" bits + * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, + * and instead sends all primitives down the pipeline for rasterization. + * If the SOL stage is enabled, "Render Stream Select" is honored and + * primitives bound to non-zero streams are discarded after stream output. 
 + * + * Since the only purpose of primitives sent to non-zero streams is to + * be recorded by transform feedback, we can simply discard all geometry + * bound to these streams when transform feedback is disabled. + */ + if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings) + return; + + /* If we're outputting 32 control data bits or less, then we can wait + * until the shader is over to output them all. Otherwise we need to + * output them as we go. Now is the time to do it, since we're about to + * output the vertex_count'th vertex, so it's guaranteed that the + * control data bits associated with the (vertex_count - 1)th vertex are + * correct. + */ + if (s.gs_compile->control_data_header_size_bits > 32) { + const fs_builder abld = + ntb.bld.annotate("emit vertex: emit control data bits"); + + /* Only emit control data bits if we've finished accumulating a batch + * of 32 bits. This is the case when: + * + * (vertex_count * bits_per_vertex) % 32 == 0 + * + * (in other words, when the last 5 bits of vertex_count * + * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some + * integer n (which is always the case, since bits_per_vertex is + * always 1 or 2), this is equivalent to requiring that the last 5-n + * bits of vertex_count are 0: + * + * vertex_count & (2^(5-n) - 1) == 0 + * + * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is + * equivalent to: + * + * vertex_count & (32 / bits_per_vertex - 1) == 0 + * + * TODO: If vertex_count is an immediate, we could do some of this math + * at compile time... + */ + fs_inst *inst = + abld.AND(ntb.bld.null_reg_d(), vertex_count, + brw_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u)); + inst->conditional_mod = BRW_CONDITIONAL_Z; + + abld.IF(BRW_PREDICATE_NORMAL); + /* If vertex_count is 0, then no control data bits have been + * accumulated yet, so we can skip emitting them. 
+ */ + abld.CMP(ntb.bld.null_reg_d(), vertex_count, brw_imm_ud(0u), + BRW_CONDITIONAL_NEQ); + abld.IF(BRW_PREDICATE_NORMAL); + s.emit_gs_control_data_bits(vertex_count); + abld.emit(BRW_OPCODE_ENDIF); + + /* Reset control_data_bits to 0 so we can start accumulating a new + * batch. + * + * Note: in the case where vertex_count == 0, this neutralizes the + * effect of any call to EndPrimitive() that the shader may have + * made before outputting its first vertex. + */ + inst = abld.MOV(s.control_data_bits, brw_imm_ud(0u)); + inst->force_writemask_all = true; + abld.emit(BRW_OPCODE_ENDIF); + } + + s.emit_urb_writes(vertex_count); + + /* In stream mode we have to set control data bits for all vertices + * unless we have disabled control data bits completely (which we do + * do for MESA_PRIM_POINTS outputs that don't use streams). + */ + if (s.gs_compile->control_data_header_size_bits > 0 && + gs_prog_data->control_data_format == + GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { + set_gs_stream_control_data_bits(ntb, vertex_count, stream_id); + } +} + +static void +emit_gs_input_load(nir_to_brw_state &ntb, const fs_reg &dst, + const nir_src &vertex_src, + unsigned base_offset, + const nir_src &offset_src, + unsigned num_components, + unsigned first_component) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(type_sz(dst.type) == 4); + struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data); + const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; + + /* TODO: figure out push input layout for invocations == 1 */ + if (gs_prog_data->invocations == 1 && + nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) && + 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) { + int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 + + nir_src_as_uint(vertex_src) * push_reg_count; + const fs_reg attr = fs_reg(ATTR, 0, dst.type); + for (unsigned i = 0; i < 
 num_components; i++) { + ntb.bld.MOV(offset(dst, bld, i), + offset(attr, bld, imm_offset + i + first_component)); + } + return; + } + + /* Resort to the pull model. Ensure the VUE handles are provided. */ + assert(gs_prog_data->base.include_vue_handles); + + fs_reg start = s.gs_payload().icp_handle_start; + fs_reg icp_handle = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + if (gs_prog_data->invocations == 1) { + if (nir_src_is_const(vertex_src)) { + /* The vertex index is constant; just select the proper URB handle. */ + icp_handle = offset(start, ntb.bld, nir_src_as_uint(vertex_src)); + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + * + * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> + * indicating that channel <n> should read the handle from + * DWord <n>. We convert that to bytes by multiplying by 4. + * + * Next, we convert the vertex index to bytes by multiplying + * by 32 (shifting by 5), and add the two together. This is + * the final indirect byte offset. + */ + fs_reg sequence = + ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ + bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); + /* Convert vertex_index to bytes (multiply by 32) */ + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(5u)); + bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); + + /* Use first_icp_handle as the base offset. There is one register + * of URB handles per vertex, so inform the register allocator that + * we might read up to nir->info.gs.vertices_in registers. 
+ */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start, + fs_reg(icp_offset_bytes), + brw_imm_ud(s.nir->info.gs.vertices_in * REG_SIZE)); + } + } else { + assert(gs_prog_data->invocations > 1); + + if (nir_src_is_const(vertex_src)) { + unsigned vertex = nir_src_as_uint(vertex_src); + assert(devinfo->ver >= 9 || vertex <= 5); + bld.MOV(icp_handle, component(start, vertex)); + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + * + */ + fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* Convert vertex_index to bytes (multiply by 4) */ + bld.SHL(icp_offset_bytes, + retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(2u)); + + /* Use first_icp_handle as the base offset. There is one DWord + * of URB handles per vertex, so inform the register allocator that + * we might read up to ceil(nir->info.gs.vertices_in / 8) registers. + */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start, + fs_reg(icp_offset_bytes), + brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) * + REG_SIZE)); + } + } + + fs_inst *inst; + fs_reg indirect_offset = get_nir_src(ntb, offset_src); + + if (nir_src_is_const(offset_src)) { + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; + + /* Constant indexing - use global offset. 
*/ + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs, + ARRAY_SIZE(srcs)); + inst->size_written = read_components * + tmp.component_size(inst->exec_size); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs, + ARRAY_SIZE(srcs)); + inst->size_written = num_components * + dst.component_size(inst->exec_size); + } + inst->offset = base_offset + nir_src_as_uint(offset_src); + } else { + /* Indirect indexing - use per-slot offsets as well. */ + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; + + if (first_component != 0) { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = read_components * + tmp.component_size(inst->exec_size); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = num_components * + dst.component_size(inst->exec_size); + } + inst->offset = base_offset; + } +} + +static fs_reg +get_indirect_offset(nir_to_brw_state &ntb, nir_intrinsic_instr *instr) +{ + nir_src *offset_src = nir_get_io_offset_src(instr); + + if (nir_src_is_const(*offset_src)) { + /* The only constant offset we should find is 0. brw_nir.c's + * add_const_offset_to_base() will fold other constant offsets + * into the "base" index. 
+ */ + assert(nir_src_as_uint(*offset_src) == 0); + return fs_reg(); + } + + return get_nir_src(ntb, *offset_src); +} + +static void +fs_nir_emit_vs_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + assert(s.stage == MESA_SHADER_VERTEX); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_base_vertex: + unreachable("should be lowered by nir_lower_system_values()"); + + case nir_intrinsic_load_input: { + assert(instr->def.bit_size == 32); + const fs_reg src = offset(fs_reg(ATTR, 0, dest.type), bld, + nir_intrinsic_base(instr) * 4 + + nir_intrinsic_component(instr) + + nir_src_as_uint(instr->src[0])); + + for (unsigned i = 0; i < instr->num_components; i++) + bld.MOV(offset(dest, bld, i), offset(src, bld, i)); + break; + } + + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_load_draw_id: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_is_indexed_draw: + unreachable("lowered by brw_nir_lower_vs_inputs"); + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +static fs_reg +get_tcs_single_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + fs_visitor &s = ntb.s; + + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data); + const nir_src &vertex_src = instr->src[0]; + nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src); + + const fs_reg start = s.tcs_payload().icp_handle_start; + + fs_reg icp_handle; + + if (nir_src_is_const(vertex_src)) { + /* Emit a MOV to resolve <0,1,0> regioning. 
 */ + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + unsigned vertex = nir_src_as_uint(vertex_src); + bld.MOV(icp_handle, component(start, vertex)); + } else if (tcs_prog_data->instances == 1 && vertex_intrin && + vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) { + /* For the common case of only 1 instance, an array index of + * gl_InvocationID means reading the handles from the start. Skip all + * the indirect work. + */ + icp_handle = start; + } else { + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + */ + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* Each ICP handle is a single DWord (4 bytes) */ + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(2u)); + + /* We might read up to 4 registers. */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, + start, vertex_offset_bytes, + brw_imm_ud(4 * REG_SIZE)); + } + + return icp_handle; +} + +static fs_reg +get_tcs_multi_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + fs_visitor &s = ntb.s; + const intel_device_info *devinfo = s.devinfo; + + struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) s.key; + const nir_src &vertex_src = instr->src[0]; + const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo); + + const fs_reg start = s.tcs_payload().icp_handle_start; + + if (nir_src_is_const(vertex_src)) + return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes); + + /* The vertex index is non-constant. We need to use indirect + * addressing to fetch the proper URB handle. + * + * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> + * indicating that channel <n> should read the handle from DWord <n>. + * We convert that to bytes by multiplying by 4. 
+ * + * Next, we convert the vertex index to bytes by multiplying + * by the GRF size (by shifting), and add the two together. This is + * the final indirect byte offset. + */ + fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + + /* Offsets will be 0, 4, 8, ... */ + bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); + /* Convert vertex_index to bytes (multiply by 32) */ + assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */ + bld.SHL(vertex_offset_bytes, + retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD), + brw_imm_ud(ffs(grf_size_bytes) - 1)); + bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); + + /* Use start of ICP handles as the base offset. There is one register + * of URB handles per vertex, so inform the register allocator that + * we might read up to nir->info.gs.vertices_in registers. 
+ */ + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start, + icp_offset_bytes, + brw_imm_ud(brw_tcs_prog_key_input_vertices(tcs_key) * + grf_size_bytes)); + + return icp_handle; +} + +static void +setup_barrier_message_payload_gfx125(const fs_builder &bld, + const fs_reg &msg_payload) +{ + assert(bld.shader->devinfo->verx10 >= 125); + + /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */ + fs_reg m0_10ub = component(retype(msg_payload, BRW_REGISTER_TYPE_UB), 10); + fs_reg r0_11ub = + stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11), + 0, 1, 0); + bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub); +} + +static void +emit_barrier(nir_to_brw_state &ntb) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + /* We are getting the barrier ID from the compute shader header */ + assert(gl_shader_stage_uses_workgroup(s.stage)); + + fs_reg payload = fs_reg(VGRF, s.alloc.allocate(1), BRW_REGISTER_TYPE_UD); + + /* Clear the message payload */ + bld.exec_all().group(8, 0).MOV(payload, brw_imm_ud(0u)); + + if (devinfo->verx10 >= 125) { + setup_barrier_message_payload_gfx125(bld, payload); + } else { + assert(gl_shader_stage_is_compute(s.stage)); + + uint32_t barrier_id_mask; + switch (devinfo->ver) { + case 7: + case 8: + barrier_id_mask = 0x0f000000u; break; + case 9: + barrier_id_mask = 0x8f000000u; break; + case 11: + case 12: + barrier_id_mask = 0x7f000000u; break; + default: + unreachable("barrier is only available on gen >= 7"); + } + + /* Copy the barrier id from r0.2 to the message payload reg.2 */ + fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)); + bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2, + brw_imm_ud(barrier_id_mask)); + } + + /* Emit a gateway "barrier" message using the payload we set up, followed + * by a wait instruction. 
+ */ + bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload); +} + +static void +emit_tcs_barrier(nir_to_brw_state &ntb) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_TESS_CTRL); + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data); + + fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg m0_2 = component(m0, 2); + + const fs_builder chanbld = bld.exec_all().group(1, 0); + + /* Zero the message header */ + bld.exec_all().MOV(m0, brw_imm_ud(0u)); + + if (devinfo->verx10 >= 125) { + setup_barrier_message_payload_gfx125(bld, m0); + } else if (devinfo->ver >= 11) { + chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(30, 24))); + + /* Set the Barrier Count and the enable bit */ + chanbld.OR(m0_2, m0_2, + brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15))); + } else { + /* Copy "Barrier ID" from r0.2, bits 16:13 */ + chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(16, 13))); + + /* Shift it up to bits 27:24. 
*/ + chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); + + /* Set the Barrier Count and the enable bit */ + chanbld.OR(m0_2, m0_2, + brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); + } + + bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); +} + +static void +fs_nir_emit_tcs_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_TESS_CTRL); + struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data); + struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; + + fs_reg dst; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dst = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_load_primitive_id: + bld.MOV(dst, s.tcs_payload().primitive_id); + break; + case nir_intrinsic_load_invocation_id: + bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id); + break; + + case nir_intrinsic_barrier: + if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) + fs_nir_emit_intrinsic(ntb, bld, instr); + if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) { + if (tcs_prog_data->instances != 1) + emit_tcs_barrier(ntb); + } + break; + + case nir_intrinsic_load_input: + unreachable("nir_lower_io should never give us these."); + break; + + case nir_intrinsic_load_per_vertex_input: { + assert(instr->def.bit_size == 32); + fs_reg indirect_offset = get_indirect_offset(ntb, instr); + unsigned imm_offset = nir_intrinsic_base(instr); + fs_inst *inst; + + const bool multi_patch = + vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH; + + fs_reg icp_handle = multi_patch ? + get_tcs_multi_patch_icp_handle(ntb, bld, instr) : + get_tcs_single_patch_icp_handle(ntb, bld, instr); + + /* We can only read two double components with each URB read, so + * we send two read messages in that case, each one loading up to + * two double components. 
+ */ + unsigned num_components = instr->num_components; + unsigned first_component = nir_intrinsic_component(instr); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle; + + if (indirect_offset.file == BAD_FILE) { + /* Constant indexing - use global offset. */ + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs, + ARRAY_SIZE(srcs)); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs, + ARRAY_SIZE(srcs)); + } + inst->offset = imm_offset; + } else { + /* Indirect indexing - use per-slot offsets as well. */ + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; + + if (first_component != 0) { + unsigned read_components = num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, + srcs, ARRAY_SIZE(srcs)); + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, + srcs, ARRAY_SIZE(srcs)); + } + inst->offset = imm_offset; + } + inst->size_written = (num_components + first_component) * + inst->dst.component_size(inst->exec_size); + + /* Copy the temporary to the destination to deal with writemasking. + * + * Also attempt to deal with gl_PointSize being in the .w component. 
 + */ + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { + assert(type_sz(dst.type) == 4); + inst->dst = bld.vgrf(dst.type, 4); + inst->size_written = 4 * REG_SIZE * reg_unit(devinfo); + bld.MOV(dst, offset(inst->dst, bld, 3)); + } + break; + } + + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: { + assert(instr->def.bit_size == 32); + fs_reg indirect_offset = get_indirect_offset(ntb, instr); + unsigned imm_offset = nir_intrinsic_base(instr); + unsigned first_component = nir_intrinsic_component(instr); + + fs_inst *inst; + if (indirect_offset.file == BAD_FILE) { + /* This MOV replicates the output handle to all enabled channels + * in SINGLE_PATCH mode. + */ + fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.MOV(patch_handle, s.tcs_payload().patch_urb_output); + + { + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle; + + if (first_component != 0) { + unsigned read_components = + instr->num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = read_components * REG_SIZE * reg_unit(devinfo); + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo); + } + inst->offset = imm_offset; + } + } else { + /* Indirect indexing - use per-slot offsets as well. 
*/ + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; + + if (first_component != 0) { + unsigned read_components = + instr->num_components + first_component; + fs_reg tmp = bld.vgrf(dst.type, read_components); + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = read_components * REG_SIZE * reg_unit(devinfo); + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dst, bld, i), + offset(tmp, bld, i + first_component)); + } + } else { + inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, + srcs, ARRAY_SIZE(srcs)); + inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo); + } + inst->offset = imm_offset; + } + break; + } + + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: { + assert(nir_src_bit_size(instr->src[0]) == 32); + fs_reg value = get_nir_src(ntb, instr->src[0]); + fs_reg indirect_offset = get_indirect_offset(ntb, instr); + unsigned imm_offset = nir_intrinsic_base(instr); + unsigned mask = nir_intrinsic_write_mask(instr); + + if (mask == 0) + break; + + unsigned num_components = util_last_bit(mask); + unsigned first_component = nir_intrinsic_component(instr); + assert((first_component + num_components) <= 4); + + mask = mask << first_component; + + const bool has_urb_lsc = devinfo->ver >= 20; + + fs_reg mask_reg; + if (mask != WRITEMASK_XYZW) + mask_reg = brw_imm_ud(mask << 16); + + fs_reg sources[4]; + + unsigned m = has_urb_lsc ? 
                   0 : first_component;
   /* Pack only the enabled components; pre-Gfx20 the URB write still
    * consumes a slot for disabled channels, so advance m anyway there.
    */
   for (unsigned i = 0; i < num_components; i++) {
      int c = i + first_component;
      if (mask & (1 << c)) {
         sources[m++] = offset(value, bld, i);
      } else if (devinfo->ver < 20) {
         m++;
      }
   }

   assert(has_urb_lsc || m == (first_component + num_components));

   fs_reg srcs[URB_LOGICAL_NUM_SRCS];
   srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
   srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
   srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
   srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_F, m);
   srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(m);
   bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);

   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                            srcs, ARRAY_SIZE(srcs));
   inst->offset = imm_offset;
   break;
   }

   default:
      fs_nir_emit_intrinsic(ntb, bld, instr);
      break;
   }
}

/**
 * Emit fs IR for a tessellation-evaluation-stage NIR intrinsic.
 *
 * Handles TES-specific intrinsics (primitive id, tess coords, and
 * patch/per-vertex input loads from the URB); everything else is
 * forwarded to the generic fs_nir_emit_intrinsic().
 */
static void
fs_nir_emit_tes_intrinsic(nir_to_brw_state &ntb,
                          nir_intrinsic_instr *instr)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_TESS_EVAL);
   struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(s.prog_data);

   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_def(ntb, instr->def);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_primitive_id:
      bld.MOV(dest, s.tes_payload().primitive_id);
      break;

   case nir_intrinsic_load_tess_coord:
      /* Tess coords arrive as three separate payload registers (u, v, w). */
      for (unsigned i = 0; i < 3; i++)
         bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input: {
      assert(instr->def.bit_size == 32);
      fs_reg indirect_offset = get_indirect_offset(ntb, instr);
      unsigned imm_offset = nir_intrinsic_base(instr);
      unsigned first_component = nir_intrinsic_component(instr);

      fs_inst *inst;
      if (indirect_offset.file == BAD_FILE) {
         /* Arbitrarily only push up to 32 vec4 slots worth of data,
          * which is 16 registers (since each holds 2 vec4 slots).
          */
         const unsigned max_push_slots = 32;
         if (imm_offset < max_push_slots) {
            /* Pushed input: read straight from the ATTR file. */
            const fs_reg src = horiz_offset(fs_reg(ATTR, 0, dest.type),
                                            4 * imm_offset + first_component);
            for (int i = 0; i < instr->num_components; i++)
               bld.MOV(offset(dest, bld, i), component(src, i));

            /* Each pushed register holds two vec4 slots, hence the /2. */
            tes_prog_data->base.urb_read_length =
               MAX2(tes_prog_data->base.urb_read_length,
                    (imm_offset / 2) + 1);
         } else {
            /* Replicate the patch handle to all enabled channels */
            fs_reg srcs[URB_LOGICAL_NUM_SRCS];
            srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;

            if (first_component != 0) {
               /* Read extra leading components and shift them away after. */
               unsigned read_components =
                  instr->num_components + first_component;
               fs_reg tmp = bld.vgrf(dest.type, read_components);
               inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
                               srcs, ARRAY_SIZE(srcs));
               inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
               for (unsigned i = 0; i < instr->num_components; i++) {
                  bld.MOV(offset(dest, bld, i),
                          offset(tmp, bld, i + first_component));
               }
            } else {
               inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
                               srcs, ARRAY_SIZE(srcs));
               inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
            }
            inst->offset = imm_offset;
         }
      } else {
         /* Indirect indexing - use per-slot offsets as well. */

         /* We can only read two double components with each URB read, so
          * we send two read messages in that case, each one loading up to
          * two double components.
          */
         unsigned num_components = instr->num_components;

         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
         srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;

         if (first_component != 0) {
            unsigned read_components =
               num_components + first_component;
            fs_reg tmp = bld.vgrf(dest.type, read_components);
            inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
                            srcs, ARRAY_SIZE(srcs));
            for (unsigned i = 0; i < num_components; i++) {
               bld.MOV(offset(dest, bld, i),
                       offset(tmp, bld, i + first_component));
            }
         } else {
            inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
                            srcs, ARRAY_SIZE(srcs));
         }
         inst->offset = imm_offset;
         inst->size_written = (num_components + first_component) *
                              inst->dst.component_size(inst->exec_size);
      }
      break;
   }
   default:
      fs_nir_emit_intrinsic(ntb, bld, instr);
      break;
   }
}

/**
 * Emit fs IR for a geometry-stage NIR intrinsic.
 *
 * Handles GS-specific intrinsics (primitive id, per-vertex input loads,
 * vertex/primitive emission and counters); everything else falls through
 * to the generic fs_nir_emit_intrinsic().
 */
static void
fs_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
                         nir_intrinsic_instr *instr)
{
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_GEOMETRY);
   fs_reg indirect_offset;

   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_def(ntb, instr->def);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_primitive_id:
      assert(s.stage == MESA_SHADER_GEOMETRY);
      assert(brw_gs_prog_data(s.prog_data)->include_primitive_id);
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), s.gs_payload().primitive_id);
      break;

   case nir_intrinsic_load_input:
      unreachable("load_input intrinsics are invalid for the GS stage");

   case nir_intrinsic_load_per_vertex_input:
      emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
                         instr->src[1], instr->num_components,
                         nir_intrinsic_component(instr));
      break;

   case nir_intrinsic_emit_vertex_with_counter:
      emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
      break;

   case nir_intrinsic_end_primitive_with_counter:
emit_gs_end_primitive(ntb, instr->src[0]); + break; + + case nir_intrinsic_set_vertex_and_primitive_count: + bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0])); + break; + + case nir_intrinsic_load_invocation_id: { + fs_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +/** + * Fetch the current render target layer index. + */ +static fs_reg +fetch_render_target_array_index(const fs_builder &bld) +{ + const fs_visitor *v = static_cast(bld.shader); + + if (bld.shader->devinfo->ver >= 20) { + /* Gfx20+ has separate Render Target Array indices for each pair + * of subspans in order to support multiple polygons, so we need + * to use a <1;8,0> region in order to select the correct word + * for each channel. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + + for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) { + const fs_builder hbld = bld.group(16, i); + const struct brw_reg reg = retype(brw_vec1_grf(2 * i + 1, 1), + BRW_REGISTER_TYPE_UW); + hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0), + brw_imm_uw(0x7ff)); + } + + return idx; + } else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) { + /* According to the BSpec "PS Thread Payload for Normal + * Dispatch", the render target array index is stored as bits + * 26:16 of either the R1.1 or R1.6 poly info dwords, for the + * first and second polygons respectively in multipolygon PS + * dispatch mode. 
+ */ + assert(bld.dispatch_width() == 16); + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + + for (unsigned i = 0; i < v->max_polygons; i++) { + const fs_builder hbld = bld.group(8, i); + const struct brw_reg g1 = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3 + 10 * i); + hbld.AND(offset(idx, hbld, i), g1, brw_imm_uw(0x7ff)); + } + + return idx; + } else if (bld.shader->devinfo->ver >= 12) { + /* The render target array index is provided in the thread payload as + * bits 26:16 of r1.1. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3), + brw_imm_uw(0x7ff)); + return idx; + } else if (bld.shader->devinfo->ver >= 6) { + /* The render target array index is provided in the thread payload as + * bits 26:16 of r0.0. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), + brw_imm_uw(0x7ff)); + return idx; + } else { + /* Pre-SNB we only ever render into the first layer of the framebuffer + * since layered rendering is not implemented. + */ + return brw_imm_ud(0); + } +} + +/* Sample from the MCS surface attached to this multisample texture. 
*/ +static fs_reg +emit_mcs_fetch(nir_to_brw_state &ntb, const fs_reg &coordinate, unsigned components, + const fs_reg &texture, + const fs_reg &texture_handle) +{ + const fs_builder &bld = ntb.bld; + + const fs_reg dest = ntb.s.vgrf(glsl_uvec4_type()); + + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate; + srcs[TEX_LOGICAL_SRC_SURFACE] = texture; + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle; + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0); + + fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs, + ARRAY_SIZE(srcs)); + + /* We only care about one or two regs of response, but the sampler always + * writes 4/8. + */ + inst->size_written = 4 * dest.component_size(inst->exec_size); + + return dest; +} + +/** + * Fake non-coherent framebuffer read implemented using TXF to fetch from the + * framebuffer at the current fragment coordinates and sample index. + */ +static fs_inst * +emit_non_coherent_fb_read(nir_to_brw_state &ntb, const fs_builder &bld, const fs_reg &dst, + unsigned target) +{ + fs_visitor &s = ntb.s; + const struct intel_device_info *devinfo = s.devinfo; + + assert(bld.shader->stage == MESA_SHADER_FRAGMENT); + const brw_wm_prog_key *wm_key = + reinterpret_cast(s.key); + assert(!wm_key->coherent_fb_fetch); + + /* Calculate the fragment coordinates. */ + const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); + bld.MOV(offset(coords, bld, 0), s.pixel_x); + bld.MOV(offset(coords, bld, 1), s.pixel_y); + bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); + + /* Calculate the sample index and MCS payload when multisampling. Luckily + * the MCS fetch message behaves deterministically for UMS surfaces, so it + * shouldn't be necessary to recompile based on whether the framebuffer is + * CMS or UMS. 
+ */ + assert(wm_key->multisample_fbo == BRW_ALWAYS || + wm_key->multisample_fbo == BRW_NEVER); + if (wm_key->multisample_fbo && + ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) + ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb); + + const fs_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]; + const fs_reg mcs = wm_key->multisample_fbo ? + emit_mcs_fetch(ntb, coords, 3, brw_imm_ud(target), fs_reg()) : fs_reg(); + + /* Use either a normal or a CMS texel fetch message depending on whether + * the framebuffer is single or multisample. On SKL+ use the wide CMS + * message just in case the framebuffer uses 16x multisampling, it should + * be equivalent to the normal CMS fetch for lower multisampling modes. + */ + opcode op; + if (wm_key->multisample_fbo) { + /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x + * multisampling, it should be equivalent to the normal CMS fetch for + * lower multisampling modes. + * + * On Gfx12HP, there is only CMS_W variant available. + */ + if (devinfo->verx10 >= 125) + op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL; + else if (devinfo->ver >= 9) + op = SHADER_OPCODE_TXF_CMS_W_LOGICAL; + else + op = SHADER_OPCODE_TXF_CMS_LOGICAL; + } else { + op = SHADER_OPCODE_TXF_LOGICAL; + } + + /* Emit the instruction. 
*/ + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + srcs[TEX_LOGICAL_SRC_COORDINATE] = coords; + srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample; + srcs[TEX_LOGICAL_SRC_MCS] = mcs; + srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(target); + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0); + + fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); + inst->size_written = 4 * inst->dst.component_size(inst->exec_size); + + return inst; +} + +/** + * Actual coherent framebuffer read implemented using the native render target + * read message. Requires SKL+. + */ +static fs_inst * +emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) +{ + assert(bld.shader->devinfo->ver >= 9); + fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); + inst->target = target; + inst->size_written = 4 * inst->dst.component_size(inst->exec_size); + + return inst; +} + +static fs_reg +alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) +{ + if (n && regs[0].file != BAD_FILE) { + return regs[0]; + + } else { + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); + + for (unsigned i = 0; i < n; i++) + regs[i] = tmp; + + return tmp; + } +} + +static fs_reg +alloc_frag_output(nir_to_brw_state &ntb, unsigned location) +{ + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_FRAGMENT); + const brw_wm_prog_key *const key = + reinterpret_cast(s.key); + const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); + const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); + + if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) + return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1); + + else if (l == FRAG_RESULT_COLOR) + return alloc_temporary(ntb.bld, 4, s.outputs, + MAX2(key->nr_color_regions, 1)); + + 
else if (l == FRAG_RESULT_DEPTH) + return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1); + + else if (l == FRAG_RESULT_STENCIL) + return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1); + + else if (l == FRAG_RESULT_SAMPLE_MASK) + return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1); + + else if (l >= FRAG_RESULT_DATA0 && + l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) + return alloc_temporary(ntb.bld, 4, + &s.outputs[l - FRAG_RESULT_DATA0], 1); + + else + unreachable("Invalid location"); +} + +static void +emit_is_helper_invocation(nir_to_brw_state &ntb, fs_reg result) +{ + const fs_builder &bld = ntb.bld; + + /* Unlike the regular gl_HelperInvocation, that is defined at dispatch, + * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into + * consideration demoted invocations. + */ + result.type = BRW_REGISTER_TYPE_UD; + + bld.MOV(result, brw_imm_ud(0)); + + /* See brw_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */ + unsigned width = bld.dispatch_width(); + for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) { + const fs_builder b = bld.group(MIN2(width, 16), i); + + fs_inst *mov = b.MOV(offset(result, b, i), brw_imm_ud(~0)); + + /* The at() ensures that any code emitted to get the predicate happens + * before the mov right above. This is not an issue elsewhere because + * lowering code already set up the builder this way. 
       */
      brw_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
      mov->predicate_inverse = true;
   }
}

/**
 * Emit the MOVs that materialize gl_FragCoord (x, y, z, w) into the four
 * consecutive components starting at \p wpos.
 */
static void
emit_fragcoord_interpolation(nir_to_brw_state &ntb, fs_reg wpos)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);

   /* gl_FragCoord.x */
   bld.MOV(wpos, s.pixel_x);
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.y */
   bld.MOV(wpos, s.pixel_y);
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.z */
   if (devinfo->ver >= 6) {
      bld.MOV(wpos, s.pixel_z);
   } else {
      /* Pre-Gfx6: interpolate Z from the position varying's plane setup. */
      bld.emit(FS_OPCODE_LINTERP, wpos,
               s.delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
               s.interp_reg(bld, VARYING_SLOT_POS, 2, 0));
   }
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   bld.MOV(wpos, s.wpos_w);
}

/**
 * Compute gl_FrontFacing as a boolean (~0 front-facing / 0 back-facing)
 * from the generation-specific payload bit that encodes facedness.
 */
static fs_reg
emit_frontfacing_interpolation(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   fs_reg ff = bld.vgrf(BRW_REGISTER_TYPE_D);

   if (devinfo->ver >= 20) {
      /* Gfx20+ has separate back-facing bits for each pair of
       * subspans in order to support multiple polygons, so we need to
       * use a <1;8,0> region in order to select the correct word for
       * each channel.
       */
      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);

      for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
         const fs_builder hbld = bld.group(16, i);
         const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
                                             BRW_REGISTER_TYPE_UW);
         hbld.AND(offset(tmp, hbld, i), gi_uw, brw_imm_uw(0x800));
      }

      bld.CMP(ff, tmp, brw_imm_uw(0), BRW_CONDITIONAL_Z);

   } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
      /* According to the BSpec "PS Thread Payload for Normal
       * Dispatch", the front/back facing interpolation bit is stored
       * as bit 15 of either the R1.1 or R1.6 poly info field, for the
       * first and second polygons respectively in multipolygon PS
       * dispatch mode.
       */
      assert(s.dispatch_width == 16);
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);

      for (unsigned i = 0; i < s.max_polygons; i++) {
         const fs_builder hbld = bld.group(8, i);
         const struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
                                          BRW_REGISTER_TYPE_W);
         /* ASR 15 sign-extends bit 15 across the word: ~0 or 0. */
         hbld.ASR(offset(tmp, hbld, i), g1, brw_imm_d(15));
      }

      bld.NOT(ff, tmp);

   } else if (devinfo->ver >= 12) {
      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));

      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
      bld.ASR(tmp, g1, brw_imm_d(15));
      bld.NOT(ff, tmp);
   } else if (devinfo->ver >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      bld.ASR(ff, g0, brw_imm_d(15));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it. Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      bld.ASR(ff, g1_6, brw_imm_d(31));
   }

   return ff;
}

/**
 * Compute gl_SamplePosition as a float vec2 in [0, 1], read from the
 * sub-pixel sample offsets in the thread payload.  Falls back to the
 * pixel center (0.5, 0.5) when not dispatched per-sample.
 */
static fs_reg
emit_samplepos_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
   assert(devinfo->ver >= 6);

   const fs_builder abld = bld.annotate("compute sample position");
   fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2);

   if (wm_prog_data->persample_dispatch == BRW_NEVER) {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5).
       */
      bld.MOV(offset(pos, bld, 0), brw_imm_f(0.5f));
      bld.MOV(offset(pos, bld, 1), brw_imm_f(0.5f));
      return pos;
   }

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in  thread payload. So, read
    * the positions using vstride=16, width=8, hstride=2.
    */
   const fs_reg sample_pos_reg =
      fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, BRW_REGISTER_TYPE_W);

   for (unsigned i = 0; i < 2; i++) {
      fs_reg tmp_d = bld.vgrf(BRW_REGISTER_TYPE_D);
      abld.MOV(tmp_d, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, i));
      /* Convert int_sample_pos to floating point */
      fs_reg tmp_f = bld.vgrf(BRW_REGISTER_TYPE_F);
      abld.MOV(tmp_f, tmp_d);
      /* Scale to the range [0, 1] */
      abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f));
   }

   if (wm_prog_data->persample_dispatch == BRW_SOMETIMES) {
      /* Dynamic per-sample dispatch: select the payload position when the
       * MSAA flag is set, the pixel center otherwise.
       */
      check_dynamic_msaa_flag(abld, wm_prog_data,
                              INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
      for (unsigned i = 0; i < 2; i++) {
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
                               brw_imm_f(0.5f)));
      }
   }

   return pos;
}

/**
 * Compute gl_SampleID for every channel from the packed sample-id fields
 * in the thread payload (generation-specific layout; see the per-branch
 * comments below).
 */
static fs_reg
emit_sampleid_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
   assert(devinfo->ver >= 6);

   const fs_builder abld = bld.annotate("compute sample id");
   fs_reg sample_id = abld.vgrf(BRW_REGISTER_TYPE_UD);

   assert(key->multisample_fbo != BRW_NEVER);

   if (devinfo->ver >= 8) {
      /* Sample ID comes in as 4-bit numbers in g1.0:
       *
       *    15:12 Slot 3 SampleID (only used in SIMD16)
       *     11:8 Slot 2 SampleID (only used in SIMD16)
       *      7:4 Slot 1 SampleID
       *      3:0 Slot 0 SampleID
       *
       * Each slot corresponds to four channels, so we want to replicate each
       * half-byte value to 4 channels in a row:
       *
       *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
       *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
       *
       *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
       *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
       *
       * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
       * channels to read the first byte (7:0), and the second group of 8
       * channels to read the second byte (15:8).  Then, we shift right by
       * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
       * values into place.  Finally, we AND with 0xf to keep the low nibble.
       *
       *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
       *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
       *
       * TODO: These payload bits exist on Gfx7 too, but they appear to always
       *       be zero, so this code fails to work.  We should find out why.
       */
      const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);

      for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
         const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
         /* According to the "PS Thread Payload for Normal Dispatch"
          * pages on the BSpec, the sample ids are stored in R0.8/R1.8
          * on gfx20+ and in R1.0/R2.0 on gfx8+.
          */
         const struct brw_reg id_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
                                       brw_vec1_grf(i + 1, 0);
         hbld.SHR(offset(tmp, hbld, i),
                  stride(retype(id_reg, BRW_REGISTER_TYPE_UB), 1, 8, 0),
                  brw_imm_v(0x44440000));
      }

      abld.AND(sample_id, tmp, brw_imm_w(0xf));
   } else {
      const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
      const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       *
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
       * the first four slots are sample 0 of subspan 0; the next four
       * are sample 1 of subspan 0; the third group is sample 0 of
       * subspan 1, and finally sample 1 of subspan 1.
       */

      /* SKL+ has an extra bit for the Starting Sample Pair Index to
       * accommodate 16x MSAA.
       */
      abld.exec_all().group(1, 0)
          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
               brw_imm_ud(0xc0));
      abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));

      /* This works for SIMD8-SIMD16.  It also works for SIMD32 but only if we
       * can assume 4x MSAA.  Disallow it on IVB+
       *
       * FINISHME: One day, we could come up with a way to do this that
       * actually works on gfx7.
       */
      if (devinfo->ver >= 7)
         s.limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
      abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));

      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      abld.emit(FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
   }

   if (key->multisample_fbo == BRW_SOMETIMES) {
      /* Dynamic MSAA: force sample id 0 when the FBO is single-sampled. */
      check_dynamic_msaa_flag(abld, wm_prog_data,
                              INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
      set_predicate(BRW_PREDICATE_NORMAL,
                    abld.SEL(sample_id, sample_id, brw_imm_ud(0)));
   }

   return sample_id;
}

/**
 * Compute gl_SampleMaskIn: the payload coverage mask, restricted to the
 * current sample's bit when dispatching per-sample.
 */
static fs_reg
emit_samplemaskin_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
   assert(devinfo->ver >= 6);

   /* The HW doesn't provide us with expected values.
    */
   assert(wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS);

   fs_reg coverage_mask =
      fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, BRW_REGISTER_TYPE_D);

   if (wm_prog_data->persample_dispatch == BRW_NEVER)
      return coverage_mask;

   /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
    * and a mask representing which sample is being processed by the
    * current shader invocation.
    *
    * From the OES_sample_variables specification:
    * "When per-sample shading is active due to the use of a fragment input
    *  qualified by "sample" or due to the use of the gl_SampleID or
    *  gl_SamplePosition variables, only the bit for the current sample is
    *  set in gl_SampleMaskIn."
    */
   const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");

   if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
      ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);

   /* mask = coverage & (1 << gl_SampleID) */
   fs_reg one = s.vgrf(glsl_int_type());
   fs_reg enabled_mask = s.vgrf(glsl_int_type());
   abld.MOV(one, brw_imm_d(1));
   abld.SHL(enabled_mask, one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D);
   abld.AND(mask, enabled_mask, coverage_mask);

   if (wm_prog_data->persample_dispatch == BRW_ALWAYS)
      return mask;

   /* Dynamic case: fall back to the raw coverage mask when not actually
    * dispatched per-sample.
    */
   check_dynamic_msaa_flag(abld, wm_prog_data,
                           INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));

   return mask;
}

/**
 * Compute gl_ShadingRateEXT from the actual coarse-pixel size fields in the
 * thread payload, encoded as the SPIR-V bitfield (rate/2, shifted).
 * Returns 0 when not in coarse pixel dispatch mode.
 */
static fs_reg
emit_shading_rate_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;

   assert(devinfo->ver >= 11);

   struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(bld.shader->stage_prog_data);

   /* Coarse pixel shading size fields overlap with other fields of not in
    * coarse pixel dispatch mode, so report 0 when that's not the case.
    */
   if (wm_prog_data->coarse_pixel_dispatch == BRW_NEVER)
      return brw_imm_ud(0);

   const fs_builder abld = bld.annotate("compute fragment shading rate");

   /* The shading rates provided in the shader are the actual 2D shading
    * rate while the SPIR-V built-in is the enum value that has the shading
    * rate encoded as a bitfield.  Fortunately, the bitfield value is just
    * the shading rate divided by two and shifted.
    */

   /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
   fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
   /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
   fs_reg actual_y = byte_offset(actual_x, 1);

   fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
   fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);

   abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
   abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
   abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));

   fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD);
   abld.OR(rate, int_rate_x, int_rate_y);

   if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS)
      return rate;

   /* Dynamic case: report 0 unless coarse RT writes are actually enabled. */
   check_dynamic_msaa_flag(abld, wm_prog_data,
                           INTEL_MSAA_FLAG_COARSE_RT_WRITES);
   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0)));

   return rate;
}

static void
fs_nir_emit_fs_intrinsic(nir_to_brw_state &ntb,
                         nir_intrinsic_instr *instr)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const fs_builder &bld = ntb.bld;
   fs_visitor &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);

   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_def(ntb, instr->def);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_front_face:
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
              emit_frontfacing_interpolation(ntb));
      break;

   case nir_intrinsic_load_sample_pos:
   case nir_intrinsic_load_sample_pos_or_center: {
      fs_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
      assert(sample_pos.file != BAD_FILE);
dest.type = sample_pos.type; + bld.MOV(dest, sample_pos); + bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); + break; + } + + case nir_intrinsic_load_layer_id: + dest.type = BRW_REGISTER_TYPE_UD; + bld.MOV(dest, fetch_render_target_array_index(bld)); + break; + + case nir_intrinsic_is_helper_invocation: + emit_is_helper_invocation(ntb, dest); + break; + + case nir_intrinsic_load_helper_invocation: + case nir_intrinsic_load_sample_mask_in: + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_frag_shading_rate: { + gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); + fs_reg val = ntb.system_values[sv]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + + case nir_intrinsic_store_output: { + const fs_reg src = get_nir_src(ntb, instr->src[0]); + const unsigned store_offset = nir_src_as_uint(instr->src[1]); + const unsigned location = nir_intrinsic_base(instr) + + SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION); + const fs_reg new_dest = retype(alloc_frag_output(ntb, location), + src.type); + + for (unsigned j = 0; j < instr->num_components; j++) + bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), + offset(src, bld, j)); + + break; + } + + case nir_intrinsic_load_output: { + const unsigned l = GET_FIELD(nir_intrinsic_base(instr), + BRW_NIR_FRAG_OUTPUT_LOCATION); + assert(l >= FRAG_RESULT_DATA0); + const unsigned load_offset = nir_src_as_uint(instr->src[0]); + const unsigned target = l - FRAG_RESULT_DATA0 + load_offset; + const fs_reg tmp = bld.vgrf(dest.type, 4); + + if (reinterpret_cast(s.key)->coherent_fb_fetch) + emit_coherent_fb_read(bld, tmp, target); + else + emit_non_coherent_fb_read(ntb, bld, tmp, target); + + for (unsigned j = 0; j < instr->num_components; j++) { + bld.MOV(offset(dest, bld, j), + offset(tmp, bld, nir_intrinsic_component(instr) + j)); + } + + break; + } + + case nir_intrinsic_demote: + case nir_intrinsic_discard: + case 
nir_intrinsic_terminate: + case nir_intrinsic_demote_if: + case nir_intrinsic_discard_if: + case nir_intrinsic_terminate_if: { + /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we + * can update just the flag bits that aren't yet discarded. If there's + * no condition, we emit a CMP of g0 != g0, so all currently executing + * channels will get turned off. + */ + fs_inst *cmp = NULL; + if (instr->intrinsic == nir_intrinsic_demote_if || + instr->intrinsic == nir_intrinsic_discard_if || + instr->intrinsic == nir_intrinsic_terminate_if) { + nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]); + + if (alu != NULL && + alu->op != nir_op_bcsel && + (devinfo->ver > 5 || + (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE || + alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 || + alu->op == nir_op_flt32 || alu->op == nir_op_fge32 || + alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 || + alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 || + alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) { + /* Re-emit the instruction that generated the Boolean value, but + * do not store it. Since this instruction will be conditional, + * other instructions that want to use the real Boolean value may + * get garbage. This was a problem for piglit's fs-discard-exit-2 + * test. + * + * Ideally we'd detect that the instruction cannot have a + * conditional modifier before emitting the instructions. Alas, + * that is nigh impossible. Instead, we're going to assume the + * instruction (or last instruction) generated can have a + * conditional modifier. If it cannot, fallback to the old-style + * compare, and hope dead code elimination will clean up the + * extra instructions generated. 
+ */ + fs_nir_emit_alu(ntb, alu, false); + + cmp = (fs_inst *) s.instructions.get_tail(); + if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) { + if (cmp->can_do_cmod()) + cmp->conditional_mod = BRW_CONDITIONAL_Z; + else + cmp = NULL; + } else { + /* The old sequence that would have been generated is, + * basically, bool_result == false. This is equivalent to + * !bool_result, so negate the old modifier. + */ + cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod); + } + } + + if (cmp == NULL) { + cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]), + brw_imm_d(0), BRW_CONDITIONAL_Z); + } + } else { + fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW)); + cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); + } + + cmp->predicate = BRW_PREDICATE_NORMAL; + cmp->flag_subreg = sample_mask_flag_subreg(s); + + fs_inst *jump = bld.emit(BRW_OPCODE_HALT); + jump->flag_subreg = sample_mask_flag_subreg(s); + jump->predicate_inverse = true; + + if (instr->intrinsic == nir_intrinsic_terminate || + instr->intrinsic == nir_intrinsic_terminate_if) { + jump->predicate = BRW_PREDICATE_NORMAL; + } else { + /* Only jump when the whole quad is demoted. For historical + * reasons this is also used for discard. + */ + jump->predicate = (devinfo->ver >= 20 ? XE2_PREDICATE_ANY : + BRW_PREDICATE_ALIGN1_ANY4H); + } + + if (devinfo->ver < 7) + s.limit_dispatch_width( + 16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); + break; + } + + case nir_intrinsic_load_input: { + /* In Fragment Shaders load_input is used either for flat inputs or + * per-primitive inputs. 
+ */ + assert(instr->def.bit_size == 32); + unsigned base = nir_intrinsic_base(instr); + unsigned comp = nir_intrinsic_component(instr); + unsigned num_components = instr->num_components; + + const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) s.key; + + if (wm_key->mesh_input == BRW_SOMETIMES) { + assert(devinfo->verx10 >= 125); + /* The FS payload gives us the viewport and layer clamped to valid + * ranges, but the spec for gl_ViewportIndex and gl_Layer includes + * the language: + * the fragment stage will read the same value written by the + * geometry stage, even if that value is out of range. + * + * Which is why these are normally passed as regular attributes. + * This isn't tested anywhere except some GL-only piglit tests + * though, so for the case where the FS may be used against either a + * traditional pipeline or a mesh one, where the position of these + * will change depending on the previous stage, read them from the + * payload to simplify things until the requisite magic is in place. + */ + if (base == VARYING_SLOT_LAYER || base == VARYING_SLOT_VIEWPORT) { + assert(num_components == 1); + fs_reg g1(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD)); + + unsigned mask, shift_count; + if (base == VARYING_SLOT_LAYER) { + shift_count = 16; + mask = 0x7ff << shift_count; + } else { + shift_count = 27; + mask = 0xf << shift_count; + } + + fs_reg vp_or_layer = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(vp_or_layer, g1, brw_imm_ud(mask)); + fs_reg shifted_value = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.SHR(shifted_value, vp_or_layer, brw_imm_ud(shift_count)); + bld.MOV(offset(dest, bld, 0), retype(shifted_value, dest.type)); + break; + } + } + + /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. 
*/ + + /* Special case fields in the VUE header */ + if (base == VARYING_SLOT_LAYER) + comp = 1; + else if (base == VARYING_SLOT_VIEWPORT) + comp = 2; + + if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) { + assert(base != VARYING_SLOT_PRIMITIVE_INDICES); + for (unsigned int i = 0; i < num_components; i++) { + bld.MOV(offset(dest, bld, i), + retype(s.per_primitive_reg(bld, base, comp + i), dest.type)); + } + } else { + /* Gfx20+ packs the plane parameters of a single logical + * input in a vec3 format instead of the previously used vec4 + * format. + */ + const unsigned k = devinfo->ver >= 20 ? 0 : 3; + for (unsigned int i = 0; i < num_components; i++) { + bld.MOV(offset(dest, bld, i), + retype(s.interp_reg(bld, base, comp + i, k), dest.type)); + } + } + break; + } + + case nir_intrinsic_load_fs_input_interp_deltas: { + assert(s.stage == MESA_SHADER_FRAGMENT); + assert(nir_src_as_uint(instr->src[0]) == 0); + const unsigned base = nir_intrinsic_base(instr); + const unsigned comp = nir_intrinsic_component(instr); + dest.type = BRW_REGISTER_TYPE_F; + + /* Gfx20+ packs the plane parameters of a single logical + * input in a vec3 format instead of the previously used vec4 + * format. 
+ */ + if (devinfo->ver >= 20) { + bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 0)); + bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 2)); + bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 1)); + } else { + bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3)); + bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1)); + bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0)); + } + + break; + } + + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: { + /* Use the delta_xy values computed from the payload */ + enum brw_barycentric_mode bary = brw_barycentric_mode(instr); + const fs_reg srcs[] = { offset(s.delta_xy[bary], bld, 0), + offset(s.delta_xy[bary], bld, 1) }; + bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); + break; + } + + case nir_intrinsic_load_barycentric_at_sample: { + const glsl_interp_mode interpolation = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); + + fs_reg msg_data; + if (nir_src_is_const(instr->src[0])) { + msg_data = brw_imm_ud(nir_src_as_uint(instr->src[0]) << 4); + } else { + const fs_reg sample_src = retype(get_nir_src(ntb, instr->src[0]), + BRW_REGISTER_TYPE_UD); + const fs_reg sample_id = bld.emit_uniformize(sample_src); + msg_data = component(bld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD), 0); + bld.exec_all().group(1, 0).SHL(msg_data, sample_id, brw_imm_ud(4u)); + } + + fs_reg flag_reg; + struct brw_wm_prog_key *wm_prog_key = (struct brw_wm_prog_key *) s.key; + if (wm_prog_key->multisample_fbo == BRW_SOMETIMES) { + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data); + + check_dynamic_msaa_flag(bld.exec_all().group(8, 0), + wm_prog_data, + INTEL_MSAA_FLAG_MULTISAMPLE_FBO); + flag_reg = brw_flag_reg(0, 0); + } + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dest, + fs_reg(), /* src */ + msg_data, + flag_reg, + interpolation); + 
break; + } + + case nir_intrinsic_load_barycentric_at_offset: { + const glsl_interp_mode interpolation = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + + if (const_offset) { + assert(nir_src_bit_size(instr->src[0]) == 32); + unsigned off_x = const_offset[0].u32 & 0xf; + unsigned off_y = const_offset[1].u32 & 0xf; + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + dest, + fs_reg(), /* src */ + brw_imm_ud(off_x | (off_y << 4)), + fs_reg(), /* flag_reg */ + interpolation); + } else { + fs_reg src = retype(get_nir_src(ntb, instr->src[0]), BRW_REGISTER_TYPE_D); + const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; + emit_pixel_interpolater_send(bld, + opcode, + dest, + src, + brw_imm_ud(0u), + fs_reg(), /* flag_reg */ + interpolation); + } + break; + } + + case nir_intrinsic_load_frag_coord: + emit_fragcoord_interpolation(ntb, dest); + break; + + case nir_intrinsic_load_interpolated_input: { + assert(instr->src[0].ssa && + instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *bary_intrinsic = + nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); + nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; + enum glsl_interp_mode interp_mode = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); + fs_reg dst_xy; + + if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || + bary_intrin == nir_intrinsic_load_barycentric_at_sample) { + /* Use the result of the PI message. 
*/ + dst_xy = retype(get_nir_src(ntb, instr->src[0]), BRW_REGISTER_TYPE_F); + } else { + /* Use the delta_xy values computed from the payload */ + enum brw_barycentric_mode bary = brw_barycentric_mode(bary_intrinsic); + dst_xy = s.delta_xy[bary]; + } + + for (unsigned int i = 0; i < instr->num_components; i++) { + fs_reg interp = + s.interp_reg(bld, nir_intrinsic_base(instr), + nir_intrinsic_component(instr) + i, 0); + interp.type = BRW_REGISTER_TYPE_F; + dest.type = BRW_REGISTER_TYPE_F; + + if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) { + fs_reg tmp = s.vgrf(glsl_float_type()); + bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); + bld.MUL(offset(dest, bld, i), tmp, s.pixel_w); + } else { + bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); + } + } + break; + } + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +static void +fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(gl_shader_stage_uses_workgroup(s.stage)); + struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(s.prog_data); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_barrier: + if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) + fs_nir_emit_intrinsic(ntb, bld, instr); + if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) { + /* The whole workgroup fits in a single HW thread, so all the + * invocations are already executed lock-step. Instead of an actual + * barrier just emit a scheduling fence, that will generate no code. 
+ */ + if (!s.nir->info.workgroup_size_variable && + s.workgroup_size() <= s.dispatch_width) { + bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE); + break; + } + + emit_barrier(ntb); + cs_prog_data->uses_barrier = true; + } + break; + + case nir_intrinsic_load_subgroup_id: + s.cs_payload().load_subgroup_id(bld, dest); + break; + + case nir_intrinsic_load_local_invocation_id: + /* This is only used for hardware generated local IDs. */ + assert(cs_prog_data->generate_local_id); + + dest.type = BRW_REGISTER_TYPE_UD; + + for (unsigned i = 0; i < 3; i++) + bld.MOV(offset(dest, bld, i), s.cs_payload().local_invocation_id[i]); + break; + + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_workgroup_id_zero_base: { + fs_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID]; + assert(val.file != BAD_FILE); + dest.type = val.type; + for (unsigned i = 0; i < 3; i++) + bld.MOV(offset(dest, bld, i), offset(val, bld, i)); + break; + } + + case nir_intrinsic_load_num_workgroups: { + assert(instr->def.bit_size == 32); + + cs_prog_data->uses_num_work_groups = true; + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(0); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + fs_inst *inst = + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = 3 * s.dispatch_width * 4; + break; + } + + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + fs_nir_emit_surface_atomic(ntb, bld, instr, brw_imm_ud(GFX7_BTI_SLM), + false /* bindless */); + break; + + case nir_intrinsic_load_shared: { + assert(devinfo->ver >= 7); + + const unsigned bit_size = instr->def.bit_size; + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = 
brw_imm_ud(GFX7_BTI_SLM); + + fs_reg addr = get_nir_src(ntb, instr->src[0]); + int base = nir_intrinsic_base(instr); + if (base) { + fs_reg addr_off = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.ADD(addr_off, addr, brw_imm_d(base)); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off; + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr; + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + + /* Make dest unsigned because that's what the temporary will be */ + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + /* Read the vector */ + assert(bit_size <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(instr->def.num_components <= 4); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + fs_inst *inst = + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * s.dispatch_width * 4; + } else { + assert(instr->def.num_components == 1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + + fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(dest, subscript(read_result, dest.type, 0)); + } + break; + } + + case nir_intrinsic_store_shared: { + assert(devinfo->ver >= 7); + + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); + + fs_reg addr = get_nir_src(ntb, instr->src[1]); + int base = nir_intrinsic_base(instr); + if (base) { + fs_reg addr_off = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.ADD(addr_off, addr, brw_imm_d(base)); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off; + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr; + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = 
brw_imm_ud(1); + /* No point in masking with sample mask, here we're handling compute + * intrinsics. + */ + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + + fs_reg data = get_nir_src(ntb, instr->src[0]); + data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + assert(bit_size <= 32); + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_src_num_components(instr->src[0]) <= 4); + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + assert(nir_src_num_components(instr->src[0]) == 1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + + srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); + + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + break; + } + + case nir_intrinsic_load_workgroup_size: { + /* Should have been lowered by brw_nir_lower_cs_intrinsics() or + * crocus/iris_setup_uniforms() for the variable group size case. 
+ */ + unreachable("Should have been lowered"); + break; + } + + case nir_intrinsic_dpas_intel: { + const unsigned sdepth = nir_intrinsic_systolic_depth(instr); + const unsigned rcount = nir_intrinsic_repeat_count(instr); + + const brw_reg_type dest_type = + brw_type_for_nir_type(devinfo, nir_intrinsic_dest_type(instr)); + const brw_reg_type src_type = + brw_type_for_nir_type(devinfo, nir_intrinsic_src_type(instr)); + + dest = retype(dest, dest_type); + fs_reg src2 = retype(get_nir_src(ntb, instr->src[2]), dest_type); + const fs_reg dest_hf = dest; + + fs_builder bld8 = bld.exec_all().group(8, 0); + fs_builder bld16 = bld.exec_all().group(16, 0); + + /* DG2 cannot have the destination or source 0 of DPAS be float16. It is + * still advantageous to support these formats for memory and bandwidth + * savings. + * + * The float16 source must be expanded to float32. + */ + if (devinfo->verx10 == 125 && dest_type == BRW_REGISTER_TYPE_HF && + !s.compiler->lower_dpas) { + dest = bld8.vgrf(BRW_REGISTER_TYPE_F, rcount); + + if (src2.file != ARF) { + const fs_reg src2_hf = src2; + + src2 = bld8.vgrf(BRW_REGISTER_TYPE_F, rcount); + + for (unsigned i = 0; i < 4; i++) { + bld16.MOV(byte_offset(src2, REG_SIZE * i * 2), + byte_offset(src2_hf, REG_SIZE * i)); + } + } else { + src2 = retype(src2, BRW_REGISTER_TYPE_F); + } + } + + bld8.DPAS(dest, + src2, + retype(get_nir_src(ntb, instr->src[1]), src_type), + retype(get_nir_src(ntb, instr->src[0]), src_type), + sdepth, + rcount) + ->saturate = nir_intrinsic_saturate(instr); + + /* Compact the destination to float16 (from float32). 
*/ + if (!dest.equals(dest_hf)) { + for (unsigned i = 0; i < 4; i++) { + bld16.MOV(byte_offset(dest_hf, REG_SIZE * i), + byte_offset(dest, REG_SIZE * i * 2)); + } + } + + cs_prog_data->uses_systolic = true; + break; + } + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +static void +emit_rt_lsc_fence(const fs_builder &bld, + enum lsc_fence_scope scope, + enum lsc_flush_type flush_type) +{ + const intel_device_info *devinfo = bld.shader->devinfo; + + const fs_builder ubld = bld.exec_all().group(8, 0); + fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp, + brw_imm_ud(0) /* desc */, + brw_imm_ud(0) /* ex_desc */, + brw_vec8_grf(0, 0) /* payload */); + send->sfid = GFX12_SFID_UGM; + send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true); + send->mlen = reg_unit(devinfo); /* g0 header */ + send->ex_mlen = 0; + /* Temp write for scheduling */ + send->size_written = REG_SIZE * reg_unit(devinfo); + send->send_has_side_effects = true; + + ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp); +} + + +static void +fs_nir_emit_bs_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(brw_shader_stage_is_bindless(s.stage)); + const bs_thread_payload &payload = s.bs_payload(); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_load_btd_global_arg_addr_intel: + bld.MOV(dest, retype(payload.global_arg_ptr, dest.type)); + break; + + case nir_intrinsic_load_btd_local_arg_addr_intel: + bld.MOV(dest, retype(payload.local_arg_ptr, dest.type)); + break; + + case nir_intrinsic_load_btd_shader_type_intel: + payload.load_shader_type(bld, dest); + break; + + default: + fs_nir_emit_intrinsic(ntb, bld, instr); + break; + } +} + +static fs_reg +brw_nir_reduction_op_identity(const fs_builder &bld, + nir_op op, 
brw_reg_type type) +{ + nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); + switch (type_sz(type)) { + case 1: + if (type == BRW_REGISTER_TYPE_UB) { + return brw_imm_uw(value.u8); + } else { + assert(type == BRW_REGISTER_TYPE_B); + return brw_imm_w(value.i8); + } + case 2: + return retype(brw_imm_uw(value.u16), type); + case 4: + return retype(brw_imm_ud(value.u32), type); + case 8: + if (type == BRW_REGISTER_TYPE_DF) + return setup_imm_df(bld, value.f64); + else + return retype(brw_imm_u64(value.u64), type); + default: + unreachable("Invalid type size"); + } +} + +static opcode +brw_op_for_nir_reduction_op(nir_op op) +{ + switch (op) { + case nir_op_iadd: return BRW_OPCODE_ADD; + case nir_op_fadd: return BRW_OPCODE_ADD; + case nir_op_imul: return BRW_OPCODE_MUL; + case nir_op_fmul: return BRW_OPCODE_MUL; + case nir_op_imin: return BRW_OPCODE_SEL; + case nir_op_umin: return BRW_OPCODE_SEL; + case nir_op_fmin: return BRW_OPCODE_SEL; + case nir_op_imax: return BRW_OPCODE_SEL; + case nir_op_umax: return BRW_OPCODE_SEL; + case nir_op_fmax: return BRW_OPCODE_SEL; + case nir_op_iand: return BRW_OPCODE_AND; + case nir_op_ior: return BRW_OPCODE_OR; + case nir_op_ixor: return BRW_OPCODE_XOR; + default: + unreachable("Invalid reduction operation"); + } +} + +static brw_conditional_mod +brw_cond_mod_for_nir_reduction_op(nir_op op) +{ + switch (op) { + case nir_op_iadd: return BRW_CONDITIONAL_NONE; + case nir_op_fadd: return BRW_CONDITIONAL_NONE; + case nir_op_imul: return BRW_CONDITIONAL_NONE; + case nir_op_fmul: return BRW_CONDITIONAL_NONE; + case nir_op_imin: return BRW_CONDITIONAL_L; + case nir_op_umin: return BRW_CONDITIONAL_L; + case nir_op_fmin: return BRW_CONDITIONAL_L; + case nir_op_imax: return BRW_CONDITIONAL_GE; + case nir_op_umax: return BRW_CONDITIONAL_GE; + case nir_op_fmax: return BRW_CONDITIONAL_GE; + case nir_op_iand: return BRW_CONDITIONAL_NONE; + case nir_op_ior: return BRW_CONDITIONAL_NONE; + case nir_op_ixor: return 
BRW_CONDITIONAL_NONE; + default: + unreachable("Invalid reduction operation"); + } +} + +struct rebuild_resource { + unsigned idx; + std::vector array; +}; + +static bool +add_rebuild_src(nir_src *src, void *state) +{ + struct rebuild_resource *res = (struct rebuild_resource *) state; + + for (nir_def *def : res->array) { + if (def == src->ssa) + return true; + } + + nir_foreach_src(src->ssa->parent_instr, add_rebuild_src, state); + res->array.push_back(src->ssa); + return true; +} + +static fs_reg +try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def *resource_def) +{ + /* Create a build at the location of the resource_intel intrinsic */ + fs_builder ubld8 = bld.exec_all().group(8, 0); + + struct rebuild_resource resources = {}; + resources.idx = 0; + + if (!nir_foreach_src(resource_def->parent_instr, + add_rebuild_src, &resources)) + return fs_reg(); + resources.array.push_back(resource_def); + + if (resources.array.size() == 1) { + nir_def *def = resources.array[0]; + + if (def->parent_instr->type == nir_instr_type_load_const) { + nir_load_const_instr *load_const = + nir_instr_as_load_const(def->parent_instr); + return brw_imm_ud(load_const->value[0].i32); + } else { + assert(def->parent_instr->type == nir_instr_type_intrinsic && + (nir_instr_as_intrinsic(def->parent_instr)->intrinsic == + nir_intrinsic_load_uniform)); + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr); + unsigned base_offset = nir_intrinsic_base(intrin); + unsigned load_offset = nir_src_as_uint(intrin->src[0]); + fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD); + src.offset = load_offset + base_offset % 4; + return src; + } + } + + for (unsigned i = 0; i < resources.array.size(); i++) { + nir_def *def = resources.array[i]; + + nir_instr *instr = def->parent_instr; + switch (instr->type) { + case nir_instr_type_load_const: { + nir_load_const_instr *load_const = + nir_instr_as_load_const(instr); + fs_reg dst = 
ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = + ubld8.MOV(dst, brw_imm_ud(load_const->value[0].i32)); + break; + } + + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + + if (nir_op_infos[alu->op].num_inputs == 2) { + if (alu->src[0].swizzle[0] != 0 || + alu->src[1].swizzle[0] != 0) + break; + } else if (nir_op_infos[alu->op].num_inputs == 3) { + if (alu->src[0].swizzle[0] != 0 || + alu->src[1].swizzle[0] != 0 || + alu->src[2].swizzle[0] != 0) + break; + } else { + /* Not supported ALU input count */ + break; + } + + switch (alu->op) { + case nir_op_iadd: { + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst; + fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst; + assert(src0.file != BAD_FILE && src1.file != BAD_FILE); + assert(src0.type == BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = + ubld8.ADD(dst, + src0.file != IMM ? src0 : src1, + src0.file != IMM ? src1 : src0); + break; + } + case nir_op_iadd3: { + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst; + fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst; + fs_reg src2 = ntb.resource_insts[alu->src[2].src.ssa->index]->dst; + assert(src0.file != BAD_FILE && src1.file != BAD_FILE && src2.file != BAD_FILE); + assert(src0.type == BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = + ubld8.ADD3(dst, + src1.file == IMM ? src1 : src0, + src1.file == IMM ? 
src0 : src1, + src2); + break; + } + case nir_op_ushr: { + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst; + fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst; + assert(src0.file != BAD_FILE && src1.file != BAD_FILE); + assert(src0.type == BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = ubld8.SHR(dst, src0, src1); + break; + } + case nir_op_ishl: { + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst; + fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst; + assert(src0.file != BAD_FILE && src1.file != BAD_FILE); + assert(src0.type == BRW_REGISTER_TYPE_UD); + ntb.resource_insts[def->index] = ubld8.SHL(dst, src0, src1); + break; + } + case nir_op_mov: { + break; + } + default: + break; + } + break; + } + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_resource_intel: + ntb.resource_insts[def->index] = + ntb.resource_insts[intrin->src[1].ssa->index]; + break; + + case nir_intrinsic_load_uniform: { + if (!nir_src_is_const(intrin->src[0])) + break; + + unsigned base_offset = nir_intrinsic_base(intrin); + unsigned load_offset = nir_src_as_uint(intrin->src[0]); + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD); + src.offset = load_offset + base_offset % 4; + ntb.resource_insts[def->index] = ubld8.MOV(dst, src); + break; + } + + default: + break; + } + break; + } + + default: + break; + } + + if (ntb.resource_insts[def->index] == NULL) + return fs_reg(); + } + + assert(ntb.resource_insts[resource_def->index] != NULL); + return component(ntb.resource_insts[resource_def->index]->dst, 0); +} + +static fs_reg +get_nir_image_intrinsic_image(nir_to_brw_state &ntb, const brw::fs_builder &bld, + nir_intrinsic_instr *instr) +{ + if 
(is_resource_src(instr->src[0])) { + fs_reg surf_index = get_resource_nir_src(ntb, instr->src[0]); + if (surf_index.file != BAD_FILE) + return surf_index; + } + + fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_REGISTER_TYPE_UD); + fs_reg surf_index = image; + + return bld.emit_uniformize(surf_index); +} + +static fs_reg +get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw::fs_builder &bld, + nir_intrinsic_instr *instr) +{ + /* SSBO stores are weird in that their index is in src[1] */ + const bool is_store = + instr->intrinsic == nir_intrinsic_store_ssbo || + instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; + nir_src src = is_store ? instr->src[1] : instr->src[0]; + + if (nir_src_is_const(src)) { + return brw_imm_ud(nir_src_as_uint(src)); + } else if (is_resource_src(src)) { + fs_reg surf_index = get_resource_nir_src(ntb, src); + if (surf_index.file != BAD_FILE) + return surf_index; + } + return bld.emit_uniformize(get_nir_src(ntb, src)); +} + +/** + * The offsets we get from NIR act as if each SIMD channel has it's own blob + * of contiguous space. However, if we actually place each SIMD channel in + * it's own space, we end up with terrible cache performance because each SIMD + * channel accesses a different cache line even when they're all accessing the + * same byte offset. To deal with this problem, we swizzle the address using + * a simple algorithm which ensures that any time a SIMD message reads or + * writes the same address, it's all in the same cache line. We have to keep + * the bottom two bits fixed so that we can read/write up to a dword at a time + * and the individual element is contiguous. 
We do this by splitting the + * address as follows: + * + * 31 4-6 2 0 + * +-------------------------------+------------+----------+ + * | Hi address bits | chan index | addr low | + * +-------------------------------+------------+----------+ + * + * In other words, the bottom two address bits stay, and the top 30 get + * shifted up so that we can stick the SIMD channel index in the middle. This + * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit + * at the same logical offset, the scratch read/write instruction acts on + * continuous elements and we get good cache locality. + */ +static fs_reg +swizzle_nir_scratch_addr(nir_to_brw_state &ntb, + const brw::fs_builder &bld, + const fs_reg &nir_addr, + bool in_dwords) +{ + fs_visitor &s = ntb.s; + + const fs_reg &chan_index = + ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + const unsigned chan_index_bits = ffs(s.dispatch_width) - 1; + + fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + if (in_dwords) { + /* In this case, we know the address is aligned to a DWORD and we want + * the final address in DWORDs. + */ + bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2)); + bld.OR(addr, addr, chan_index); + } else { + /* This case substantially more annoying because we have to pay + * attention to those pesky two bottom bits. 
+ */ + fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u)); + bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits)); + fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.SHL(chan_addr, chan_index, brw_imm_ud(2)); + bld.AND(addr, nir_addr, brw_imm_ud(0x3u)); + bld.OR(addr, addr, addr_hi); + bld.OR(addr, addr, chan_addr); + } + return addr; +} + +static unsigned +choose_oword_block_size_dwords(const struct intel_device_info *devinfo, + unsigned dwords) +{ + unsigned block; + if (devinfo->has_lsc && dwords >= 64) { + block = 64; + } else if (dwords >= 32) { + block = 32; + } else if (dwords >= 16) { + block = 16; + } else { + block = 8; + } + assert(block <= dwords); + return block; +} + +static void +increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v) +{ + if (bld.shader->devinfo->has_64bit_int) { + bld.ADD(address, address, brw_imm_ud(v)); + } else { + fs_reg low = retype(address, BRW_REGISTER_TYPE_UD); + fs_reg high = offset(low, bld, 1); + + /* Add low and if that overflows, add carry to high. 
*/ + bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O; + bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL; + } +} + +static fs_reg +emit_fence(const fs_builder &bld, enum opcode opcode, + uint8_t sfid, uint32_t desc, + bool commit_enable, uint8_t bti) +{ + assert(opcode == SHADER_OPCODE_INTERLOCK || + opcode == SHADER_OPCODE_MEMORY_FENCE); + + fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0), + brw_imm_ud(commit_enable), + brw_imm_ud(bti)); + fence->sfid = sfid; + fence->desc = desc; + + return dst; +} + +static uint32_t +lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo, + nir_intrinsic_instr *instr) +{ + assert(devinfo->has_lsc); + + enum lsc_fence_scope scope = LSC_FENCE_LOCAL; + enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE; + + if (nir_intrinsic_has_memory_scope(instr)) { + switch (nir_intrinsic_memory_scope(instr)) { + case SCOPE_DEVICE: + case SCOPE_QUEUE_FAMILY: + scope = LSC_FENCE_TILE; + flush_type = LSC_FLUSH_TYPE_EVICT; + break; + case SCOPE_WORKGROUP: + scope = LSC_FENCE_THREADGROUP; + break; + case SCOPE_SHADER_CALL: + case SCOPE_INVOCATION: + case SCOPE_SUBGROUP: + case SCOPE_NONE: + break; + } + } else { + /* No scope defined. */ + scope = LSC_FENCE_TILE; + flush_type = LSC_FLUSH_TYPE_EVICT; + } + return lsc_fence_msg_desc(devinfo, scope, flush_type, true); +} + +/** + * Create a MOV to read the timestamp register. + */ +static fs_reg +get_timestamp(const fs_builder &bld) +{ + fs_visitor &s = *bld.shader; + const intel_device_info *devinfo = s.devinfo; + + assert(devinfo->ver >= 7); + + fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_TIMESTAMP, + 0), + BRW_REGISTER_TYPE_UD)); + + fs_reg dst = fs_reg(VGRF, s.alloc.allocate(1), BRW_REGISTER_TYPE_UD); + + /* We want to read the 3 fields we care about even if it's not enabled in + * the dispatch. 
+ */ + bld.group(4, 0).exec_all().MOV(dst, ts); + + return dst; +} + +static unsigned +component_from_intrinsic(nir_intrinsic_instr *instr) +{ + if (nir_intrinsic_has_component(instr)) + return nir_intrinsic_component(instr); + else + return 0; +} + +static void +adjust_handle_and_offset(const fs_builder &bld, + fs_reg &urb_handle, + unsigned &urb_global_offset) +{ + /* Make sure that URB global offset is below 2048 (2^11), because + * that's the maximum possible value encoded in Message Descriptor. + */ + unsigned adjustment = (urb_global_offset >> 11) << 11; + + if (adjustment) { + fs_builder ubld8 = bld.group(8, 0).exec_all(); + /* Allocate new register to not overwrite the shared URB handle. */ + fs_reg new_handle = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ubld8.ADD(new_handle, urb_handle, brw_imm_ud(adjustment)); + urb_handle = new_handle; + urb_global_offset -= adjustment; + } +} + +static void +emit_urb_direct_vec4_write(const fs_builder &bld, + unsigned urb_global_offset, + const fs_reg &src, + fs_reg urb_handle, + unsigned dst_comp_offset, + unsigned comps, + unsigned mask) +{ + for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { + fs_builder bld8 = bld.group(8, q); + + fs_reg payload_srcs[8]; + unsigned length = 0; + + for (unsigned i = 0; i < dst_comp_offset; i++) + payload_srcs[length++] = reg_undef; + + for (unsigned c = 0; c < comps; c++) + payload_srcs[length++] = quarter(offset(src, bld, c), q); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), + BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); + + fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->offset = urb_global_offset; + assert(inst->offset < 2048); + } +} + 
+static void +emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, fs_reg urb_handle) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + assert(nir_src_is_const(*offset_nir_src)); + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + nir_src_as_uint(*offset_nir_src) + + component_from_intrinsic(instr); + + /* URB writes are vec4 aligned but the intrinsic offsets are in dwords. + * We can write up to 8 dwords, so single vec4 write is enough. + */ + const unsigned comp_shift = offset_in_dwords % 4; + const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift; + + unsigned urb_global_offset = offset_in_dwords / 4; + adjust_handle_and_offset(bld, urb_handle, urb_global_offset); + + emit_urb_direct_vec4_write(bld, urb_global_offset, src, urb_handle, + comp_shift, comps, mask); +} + +static void +emit_urb_direct_vec4_write_xe2(const fs_builder &bld, + unsigned offset_in_bytes, + const fs_reg &src, + fs_reg urb_handle, + unsigned comps, + unsigned mask) +{ + const struct intel_device_info *devinfo = bld.shader->devinfo; + const unsigned runit = reg_unit(devinfo); + const unsigned write_size = 8 * runit; + + if (offset_in_bytes > 0) { + fs_builder bldall = bld.group(write_size, 0).exec_all(); + fs_reg new_handle = bldall.vgrf(BRW_REGISTER_TYPE_UD); + bldall.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_bytes)); + urb_handle = new_handle; + } + + for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) { + fs_builder hbld = bld.group(write_size, q); + + fs_reg payload_srcs[comps]; + + for (unsigned c = 0; c < comps; c++) + payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16); + int nr = 
bld.shader->alloc.allocate(comps * runit); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, nr, BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps); + hbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0); + + hbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + } +} + +static void +emit_urb_direct_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, fs_reg urb_handle) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + assert(nir_src_is_const(*offset_nir_src)); + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + nir_src_as_uint(*offset_nir_src) + + component_from_intrinsic(instr); + + const unsigned mask = nir_intrinsic_write_mask(instr); + + emit_urb_direct_vec4_write_xe2(bld, offset_in_dwords * 4, src, + urb_handle, comps, mask); +} + +static void +emit_urb_indirect_vec4_write(const fs_builder &bld, + const fs_reg &offset_src, + unsigned base, + const fs_reg &src, + fs_reg urb_handle, + unsigned dst_comp_offset, + unsigned comps, + unsigned mask) +{ + for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { + fs_builder bld8 = bld.group(8, q); + + /* offset is always positive, so signedness doesn't matter */ + assert(offset_src.type == BRW_REGISTER_TYPE_D || + offset_src.type == BRW_REGISTER_TYPE_UD); + fs_reg off = bld8.vgrf(offset_src.type, 1); + bld8.MOV(off, quarter(offset_src, q)); + bld8.ADD(off, off, brw_imm_ud(base)); + bld8.SHR(off, off, brw_imm_ud(2)); + + fs_reg payload_srcs[8]; + unsigned length = 0; + + for (unsigned i = 0; i < dst_comp_offset; i++) + payload_srcs[length++] = reg_undef; + + for (unsigned c = 0; c < comps; c++) + payload_srcs[length++] = quarter(offset(src, bld, c), q); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + 
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), + BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); + + fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->offset = 0; + } +} + +static void +emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, const fs_reg &offset_src, + fs_reg urb_handle, unsigned mod) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned base_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + const unsigned comp_shift = mod; + const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift; + + emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src, + urb_handle, comp_shift, comps, mask); +} + +static void +emit_urb_indirect_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, const fs_reg &offset_src, + fs_reg urb_handle) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + const struct intel_device_info *devinfo = bld.shader->devinfo; + const unsigned runit = reg_unit(devinfo); + const unsigned write_size = 8 * runit; + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned base_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + if (base_in_dwords > 0) { + fs_builder bldall = bld.group(write_size, 0).exec_all(); + fs_reg new_handle = bldall.vgrf(BRW_REGISTER_TYPE_UD); + bldall.ADD(new_handle, urb_handle, brw_imm_ud(base_in_dwords * 4)); + urb_handle = new_handle; + } + + const unsigned mask = nir_intrinsic_write_mask(instr); + + for (unsigned q = 0; q 
< bld.dispatch_width() / write_size; q++) { + fs_builder wbld = bld.group(write_size, q); + + fs_reg payload_srcs[comps]; + + for (unsigned c = 0; c < comps; c++) + payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q); + + fs_reg addr = wbld.vgrf(BRW_REGISTER_TYPE_UD); + wbld.SHL(addr, horiz_offset(offset_src, write_size * q), brw_imm_ud(2)); + wbld.ADD(addr, addr, urb_handle); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = addr; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16); + int nr = bld.shader->alloc.allocate(comps * runit); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, nr, BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps); + wbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0); + + wbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + } +} + +static void +emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &src, const fs_reg &offset_src, + fs_reg urb_handle) +{ + assert(nir_src_bit_size(instr->src[0]) == 32); + + const unsigned comps = nir_src_num_components(instr->src[0]); + assert(comps <= 4); + + const unsigned base_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + /* Use URB write messages that allow different offsets per-slot. The offset + * is in units of vec4s (128 bits), so we use a write for each component, + * replicating it in the sources and applying the appropriate mask based on + * the dword offset. 
+ */ + + for (unsigned c = 0; c < comps; c++) { + if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0) + continue; + + fs_reg src_comp = offset(src, bld, c); + + for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { + fs_builder bld8 = bld.group(8, q); + + /* offset is always positive, so signedness doesn't matter */ + assert(offset_src.type == BRW_REGISTER_TYPE_D || + offset_src.type == BRW_REGISTER_TYPE_UD); + fs_reg off = bld8.vgrf(offset_src.type, 1); + bld8.MOV(off, quarter(offset_src, q)); + bld8.ADD(off, off, brw_imm_ud(c + base_in_dwords)); + + fs_reg mask = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld8.AND(mask, off, brw_imm_ud(0x3)); + + fs_reg one = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld8.MOV(one, brw_imm_ud(1)); + bld8.SHL(mask, one, mask); + bld8.SHL(mask, mask, brw_imm_ud(16)); + + bld8.SHR(off, off, brw_imm_ud(2)); + + fs_reg payload_srcs[4]; + unsigned length = 0; + + for (unsigned j = 0; j < 4; j++) + payload_srcs[length++] = quarter(src_comp, q); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask; + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), + BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); + + fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->offset = 0; + } + } +} + +static void +emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &dest, fs_reg urb_handle) +{ + assert(instr->def.bit_size == 32); + + unsigned comps = instr->def.num_components; + if (comps == 0) + return; + + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + assert(nir_src_is_const(*offset_nir_src)); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + nir_src_as_uint(*offset_nir_src) + + 
component_from_intrinsic(instr); + + unsigned urb_global_offset = offset_in_dwords / 4; + adjust_handle_and_offset(bld, urb_handle, urb_global_offset); + + const unsigned comp_offset = offset_in_dwords % 4; + const unsigned num_regs = comp_offset + comps; + + fs_builder ubld8 = bld.group(8, 0).exec_all(); + fs_reg data = ubld8.vgrf(BRW_REGISTER_TYPE_UD, num_regs); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + + fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data, + srcs, ARRAY_SIZE(srcs)); + inst->offset = urb_global_offset; + assert(inst->offset < 2048); + inst->size_written = num_regs * REG_SIZE; + + for (unsigned c = 0; c < comps; c++) { + fs_reg dest_comp = offset(dest, bld, c); + fs_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0); + bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp); + } +} + +static void +emit_urb_direct_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &dest, fs_reg urb_handle) +{ + assert(instr->def.bit_size == 32); + + unsigned comps = instr->def.num_components; + if (comps == 0) + return; + + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + assert(nir_src_is_const(*offset_nir_src)); + + fs_builder ubld16 = bld.group(16, 0).exec_all(); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + nir_src_as_uint(*offset_nir_src) + + component_from_intrinsic(instr); + + if (offset_in_dwords > 0) { + fs_reg new_handle = ubld16.vgrf(BRW_REGISTER_TYPE_UD); + ubld16.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_dwords * 4)); + urb_handle = new_handle; + } + + fs_reg data = ubld16.vgrf(BRW_REGISTER_TYPE_UD, comps); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + + fs_inst *inst = ubld16.emit(SHADER_OPCODE_URB_READ_LOGICAL, + data, srcs, ARRAY_SIZE(srcs)); + inst->size_written = 2 * comps * REG_SIZE; + + for (unsigned c = 0; c < comps; c++) { + fs_reg dest_comp = offset(dest, bld, c); + 
fs_reg data_comp = horiz_stride(offset(data, ubld16, c), 0); + bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp); + } +} + +static void +emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &dest, const fs_reg &offset_src, fs_reg urb_handle) +{ + assert(instr->def.bit_size == 32); + + unsigned comps = instr->def.num_components; + if (comps == 0) + return; + + fs_reg seq_ud; + { + fs_builder ubld8 = bld.group(8, 0).exec_all(); + seq_ud = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg seq_uw = ubld8.vgrf(BRW_REGISTER_TYPE_UW, 1); + ubld8.MOV(seq_uw, fs_reg(brw_imm_v(0x76543210))); + ubld8.MOV(seq_ud, seq_uw); + ubld8.SHL(seq_ud, seq_ud, brw_imm_ud(2)); + } + + const unsigned base_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + for (unsigned c = 0; c < comps; c++) { + for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { + fs_builder bld8 = bld.group(8, q); + + /* offset is always positive, so signedness doesn't matter */ + assert(offset_src.type == BRW_REGISTER_TYPE_D || + offset_src.type == BRW_REGISTER_TYPE_UD); + fs_reg off = bld8.vgrf(offset_src.type, 1); + bld8.MOV(off, quarter(offset_src, q)); + bld8.ADD(off, off, brw_imm_ud(base_in_dwords + c)); + + STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1); + + fs_reg comp = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld8.AND(comp, off, brw_imm_ud(0x3)); + bld8.SHL(comp, comp, brw_imm_ud(ffs(REG_SIZE) - 1)); + bld8.ADD(comp, comp, seq_ud); + + bld8.SHR(off, off, brw_imm_ud(2)); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off; + + fs_reg data = bld8.vgrf(BRW_REGISTER_TYPE_UD, 4); + + fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, + data, srcs, ARRAY_SIZE(srcs)); + inst->offset = 0; + inst->size_written = 4 * REG_SIZE; + + fs_reg dest_comp = offset(dest, bld, c); + bld8.emit(SHADER_OPCODE_MOV_INDIRECT, + retype(quarter(dest_comp, q), 
BRW_REGISTER_TYPE_UD), + data, + comp, + brw_imm_ud(4 * REG_SIZE)); + } + } +} + +static void +emit_urb_indirect_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &dest, const fs_reg &offset_src, + fs_reg urb_handle) +{ + assert(instr->def.bit_size == 32); + + unsigned comps = instr->def.num_components; + if (comps == 0) + return; + + fs_builder ubld16 = bld.group(16, 0).exec_all(); + + const unsigned offset_in_dwords = nir_intrinsic_base(instr) + + component_from_intrinsic(instr); + + if (offset_in_dwords > 0) { + fs_reg new_handle = ubld16.vgrf(BRW_REGISTER_TYPE_UD); + ubld16.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_dwords * 4)); + urb_handle = new_handle; + } + + fs_reg data = ubld16.vgrf(BRW_REGISTER_TYPE_UD, comps); + + + for (unsigned q = 0; q < bld.dispatch_width() / 16; q++) { + fs_builder wbld = bld.group(16, q); + + fs_reg addr = wbld.vgrf(BRW_REGISTER_TYPE_UD); + wbld.SHL(addr, horiz_offset(offset_src, 16 * q), brw_imm_ud(2)); + wbld.ADD(addr, addr, urb_handle); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = addr; + + fs_inst *inst = wbld.emit(SHADER_OPCODE_URB_READ_LOGICAL, + data, srcs, ARRAY_SIZE(srcs)); + inst->size_written = 2 * comps * REG_SIZE; + + for (unsigned c = 0; c < comps; c++) { + fs_reg dest_comp = horiz_offset(offset(dest, bld, c), 16 * q); + fs_reg data_comp = offset(data, wbld, c); + wbld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp); + } + } +} + +static void +emit_task_mesh_store(nir_to_brw_state &ntb, + const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &urb_handle) +{ + fs_reg src = get_nir_src(ntb, instr->src[0]); + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + + if (nir_src_is_const(*offset_nir_src)) { + if (bld.shader->devinfo->ver >= 20) + emit_urb_direct_writes_xe2(bld, instr, src, urb_handle); + else + emit_urb_direct_writes(bld, instr, src, urb_handle); + } else { + if (bld.shader->devinfo->ver >= 20) { + 
emit_urb_indirect_writes_xe2(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle); + return; + } + bool use_mod = false; + unsigned mod; + + /* Try to calculate the value of (offset + base) % 4. If we can do + * this, then we can do indirect writes using only 1 URB write. + */ + use_mod = nir_mod_analysis(nir_get_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod); + if (use_mod) { + mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr); + mod %= 4; + } + + if (use_mod) { + emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle, mod); + } else { + emit_urb_indirect_writes(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle); + } + } +} + +static void +emit_task_mesh_load(nir_to_brw_state &ntb, + const fs_builder &bld, nir_intrinsic_instr *instr, + const fs_reg &urb_handle) +{ + fs_reg dest = get_nir_def(ntb, instr->def); + nir_src *offset_nir_src = nir_get_io_offset_src(instr); + + /* TODO(mesh): for per_vertex and per_primitive, if we could keep around + * the non-array-index offset, we could use it to decide if we can perform + * a single large aligned read instead of one per component. 
+ */ + + if (nir_src_is_const(*offset_nir_src)) { + if (bld.shader->devinfo->ver >= 20) + emit_urb_direct_reads_xe2(bld, instr, dest, urb_handle); + else + emit_urb_direct_reads(bld, instr, dest, urb_handle); + } else { + if (bld.shader->devinfo->ver >= 20) + emit_urb_indirect_reads_xe2(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle); + else + emit_urb_indirect_reads(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle); + } +} + +static void +fs_nir_emit_task_mesh_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_MESH || s.stage == MESA_SHADER_TASK); + const task_mesh_thread_payload &payload = s.task_mesh_payload(); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_load_mesh_inline_data_intel: { + fs_reg data = offset(payload.inline_parameter, 1, nir_intrinsic_align_offset(instr)); + bld.MOV(dest, retype(data, dest.type)); + break; + } + + case nir_intrinsic_load_draw_id: + dest = retype(dest, BRW_REGISTER_TYPE_UD); + bld.MOV(dest, payload.extended_parameter_0); + break; + + case nir_intrinsic_load_local_invocation_id: + unreachable("local invocation id should have been lowered earlier"); + break; + + case nir_intrinsic_load_local_invocation_index: + dest = retype(dest, BRW_REGISTER_TYPE_UD); + bld.MOV(dest, payload.local_index); + break; + + case nir_intrinsic_load_num_workgroups: + dest = retype(dest, BRW_REGISTER_TYPE_UD); + bld.MOV(offset(dest, bld, 0), brw_uw1_grf(0, 13)); /* g0.6 >> 16 */ + bld.MOV(offset(dest, bld, 1), brw_uw1_grf(0, 8)); /* g0.4 & 0xffff */ + bld.MOV(offset(dest, bld, 2), brw_uw1_grf(0, 9)); /* g0.4 >> 16 */ + break; + + case nir_intrinsic_load_workgroup_index: + dest = retype(dest, BRW_REGISTER_TYPE_UD); + bld.MOV(dest, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); + break; + + default: + 
fs_nir_emit_cs_intrinsic(ntb, instr); + break; + } +} + +static void +fs_nir_emit_task_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_TASK); + const task_mesh_thread_payload &payload = s.task_mesh_payload(); + + switch (instr->intrinsic) { + case nir_intrinsic_store_output: + case nir_intrinsic_store_task_payload: + emit_task_mesh_store(ntb, bld, instr, payload.urb_output); + break; + + case nir_intrinsic_load_output: + case nir_intrinsic_load_task_payload: + emit_task_mesh_load(ntb, bld, instr, payload.urb_output); + break; + + default: + fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr); + break; + } +} + +static void +fs_nir_emit_mesh_intrinsic(nir_to_brw_state &ntb, + nir_intrinsic_instr *instr) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + assert(s.stage == MESA_SHADER_MESH); + const task_mesh_thread_payload &payload = s.task_mesh_payload(); + + switch (instr->intrinsic) { + case nir_intrinsic_store_per_primitive_output: + case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_output: + emit_task_mesh_store(ntb, bld, instr, payload.urb_output); + break; + + case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_per_primitive_output: + case nir_intrinsic_load_output: + emit_task_mesh_load(ntb, bld, instr, payload.urb_output); + break; + + case nir_intrinsic_load_task_payload: + emit_task_mesh_load(ntb, bld, instr, payload.task_urb_input); + break; + + default: + fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr); + break; + } +} + +static void +fs_nir_emit_intrinsic(nir_to_brw_state &ntb, + const fs_builder &bld, nir_intrinsic_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + fs_visitor &s = ntb.s; + + /* We handle this as a special case */ + if (instr->intrinsic == nir_intrinsic_decl_reg) { + assert(nir_intrinsic_num_array_elems(instr) == 0); + unsigned bit_size = 
nir_intrinsic_bit_size(instr); + unsigned num_components = nir_intrinsic_num_components(instr); + const brw_reg_type reg_type = + brw_reg_type_from_bit_size(bit_size, bit_size == 8 ? + BRW_REGISTER_TYPE_D : + BRW_REGISTER_TYPE_F); + + /* Re-use the destination's slot in the table for the register */ + ntb.ssa_values[instr->def.index] = + bld.vgrf(reg_type, num_components); + return; + } + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_def(ntb, instr->def); + + switch (instr->intrinsic) { + case nir_intrinsic_resource_intel: + ntb.ssa_bind_infos[instr->def.index].valid = true; + ntb.ssa_bind_infos[instr->def.index].bindless = + (nir_intrinsic_resource_access_intel(instr) & + nir_resource_intel_bindless) != 0; + ntb.ssa_bind_infos[instr->def.index].block = + nir_intrinsic_resource_block_intel(instr); + ntb.ssa_bind_infos[instr->def.index].set = + nir_intrinsic_desc_set(instr); + ntb.ssa_bind_infos[instr->def.index].binding = + nir_intrinsic_binding(instr); + + if (nir_intrinsic_resource_access_intel(instr) & + nir_resource_intel_non_uniform) { + ntb.resource_values[instr->def.index] = fs_reg(); + } else { + ntb.resource_values[instr->def.index] = + try_rebuild_resource(ntb, bld, instr->src[1].ssa); + } + ntb.ssa_values[instr->def.index] = + ntb.ssa_values[instr->src[1].ssa->index]; + break; + + case nir_intrinsic_load_reg: + case nir_intrinsic_store_reg: + /* Nothing to do with these. */ + break; + + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: { + /* Get some metadata from the image intrinsic. 
*/ + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + switch (instr->intrinsic) { + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_image_intrinsic_image(ntb, bld, instr); + break; + + default: + /* Bindless */ + srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = + get_nir_image_intrinsic_image(ntb, bld, instr); + break; + } + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = + brw_imm_ud(nir_image_intrinsic_coord_components(instr)); + + /* Emit an image load, store or atomic op. */ + if (instr->intrinsic == nir_intrinsic_image_load || + instr->intrinsic == nir_intrinsic_bindless_image_load) { + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + fs_inst *inst = + bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * s.dispatch_width * 4; + } else if (instr->intrinsic == nir_intrinsic_image_store || + instr->intrinsic == nir_intrinsic_bindless_image_store) { + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[3]); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + unsigned num_srcs = info->num_srcs; + enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr); + if (op == LSC_OP_ATOMIC_INC || op == LSC_OP_ATOMIC_DEC) { + assert(num_srcs == 4); + num_srcs = 3; + } + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + + fs_reg data; + if (num_srcs >= 4) + data = get_nir_src(ntb, instr->src[3]); + if (num_srcs >= 5) { + fs_reg tmp = 
bld.vgrf(data.type, 2); + fs_reg sources[2] = { data, get_nir_src(ntb, instr->src[4]) }; + bld.LOAD_PAYLOAD(tmp, sources, 2, 0); + data = tmp; + } + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + + bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } + break; + } + + case nir_intrinsic_image_size: + case nir_intrinsic_bindless_image_size: { + /* Cube image sizes should have previously been lowered to a 2D array */ + assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE); + + /* Unlike the [un]typed load and store opcodes, the TXS that this turns + * into will handle the binding table index for us in the generator. + * Incidentally, this means that we can handle bindless with exactly the + * same code. + */ + fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), + BRW_REGISTER_TYPE_UD); + image = bld.emit_uniformize(image); + + assert(nir_src_as_uint(instr->src[1]) == 0); + + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + if (instr->intrinsic == nir_intrinsic_image_size) + srcs[TEX_LOGICAL_SRC_SURFACE] = image; + else + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image; + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0); + + /* Since the image size is always uniform, we can just emit a SIMD8 + * query instruction and splat the result out. 
+ */ + const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0); + + fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); + fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL, + tmp, srcs, ARRAY_SIZE(srcs)); + inst->size_written = 4 * REG_SIZE * reg_unit(devinfo); + + for (unsigned c = 0; c < instr->def.num_components; ++c) { + bld.MOV(offset(retype(dest, tmp.type), bld, c), + component(offset(tmp, ubld, c), 0)); + } + break; + } + + case nir_intrinsic_image_load_raw_intel: { + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_image_intrinsic_image(ntb, bld, instr); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + + fs_inst *inst = + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * s.dispatch_width * 4; + break; + } + + case nir_intrinsic_image_store_raw_intel: { + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_image_intrinsic_image(ntb, bld, instr); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[2]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + break; + } + + case nir_intrinsic_barrier: + case nir_intrinsic_begin_invocation_interlock: + case nir_intrinsic_end_invocation_interlock: { + bool ugm_fence, slm_fence, tgm_fence, urb_fence; + enum opcode opcode = BRW_OPCODE_NOP; + + /* Handling interlock intrinsics here will allow the logic for IVB 
+ * render cache (see below) to be reused. + */ + + switch (instr->intrinsic) { + case nir_intrinsic_barrier: { + /* Note we only care about the memory part of the + * barrier. The execution part will be taken care + * of by the stage specific intrinsic handler functions. + */ + nir_variable_mode modes = nir_intrinsic_memory_modes(instr); + ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global); + slm_fence = modes & nir_var_mem_shared; + tgm_fence = modes & nir_var_image; + urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload); + if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) + opcode = SHADER_OPCODE_MEMORY_FENCE; + break; + } + + case nir_intrinsic_begin_invocation_interlock: + /* For beginInvocationInterlockARB(), we will generate a memory fence + * but with a different opcode so that generator can pick SENDC + * instead of SEND. + */ + assert(s.stage == MESA_SHADER_FRAGMENT); + ugm_fence = tgm_fence = true; + slm_fence = urb_fence = false; + opcode = SHADER_OPCODE_INTERLOCK; + break; + + case nir_intrinsic_end_invocation_interlock: + /* For endInvocationInterlockARB(), we need to insert a memory fence which + * stalls in the shader until the memory transactions prior to that + * fence are complete. This ensures that the shader does not end before + * any writes from its critical section have landed. Otherwise, you can + * end up with a case where the next invocation on that pixel properly + * stalls for previous FS invocation on its pixel to complete but + * doesn't actually wait for the dataport memory transactions from that + * thread to land before submitting its own. 
+ */ + assert(s.stage == MESA_SHADER_FRAGMENT); + ugm_fence = tgm_fence = true; + slm_fence = urb_fence = false; + opcode = SHADER_OPCODE_MEMORY_FENCE; + break; + + default: + unreachable("invalid intrinsic"); + } + + if (opcode == BRW_OPCODE_NOP) + break; + + if (s.nir->info.shared_size > 0) { + assert(gl_shader_stage_uses_workgroup(s.stage)); + } else { + slm_fence = false; + } + + /* If the workgroup fits in a single HW thread, the messages for SLM are + * processed in-order and the shader itself is already synchronized so + * the memory fence is not necessary. + * + * TODO: Check if applies for many HW threads sharing same Data Port. + */ + if (!s.nir->info.workgroup_size_variable && + slm_fence && s.workgroup_size() <= s.dispatch_width) + slm_fence = false; + + switch (s.stage) { + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TASK: + case MESA_SHADER_MESH: + break; + default: + urb_fence = false; + break; + } + + unsigned fence_regs_count = 0; + fs_reg fence_regs[4] = {}; + + const fs_builder ubld = bld.group(8, 0); + + /* A memory barrier with acquire semantics requires us to + * guarantee that memory operations of the specified storage + * class sequenced-after the barrier aren't reordered before the + * barrier, nor before any previous atomic operation + * sequenced-before the barrier which may be synchronizing this + * acquire barrier with a prior release sequence. + * + * In order to guarantee the latter we must make sure that any + * such previous operation has completed execution before + * invalidating the relevant caches, since otherwise some cache + * could be polluted by a concurrent thread after its + * invalidation but before the previous atomic completes, which + * could lead to a violation of the expected memory ordering if + * a subsequent memory read hits the polluted cacheline, which + * would return a stale value read from memory before the + * completion of the atomic sequenced-before the barrier. 
+ * + * This ordering inversion can be avoided trivially if the + * operations we need to order are all handled by a single + * in-order cache, since the flush implied by the memory fence + * occurs after any pending operations have completed, however + * that doesn't help us when dealing with multiple caches + * processing requests out of order, in which case we need to + * explicitly stall the EU until any pending memory operations + * have executed. + * + * Note that that might be somewhat heavy handed in some cases. + * In particular when this memory fence was inserted by + * spirv_to_nir() lowering an atomic with acquire semantics into + * an atomic+barrier sequence we could do a better job by + * synchronizing with respect to that one atomic *only*, but + * that would require additional information not currently + * available to the backend. + * + * XXX - Use an alternative workaround on IVB and ICL, since + * SYNC.ALLWR is only available on Gfx12+. + */ + if (devinfo->ver >= 12 && + (!nir_intrinsic_has_memory_scope(instr) || + (nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) { + ubld.exec_all().group(1, 0).emit( + BRW_OPCODE_SYNC, ubld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR)); + } + + if (devinfo->has_lsc) { + assert(devinfo->verx10 >= 125); + uint32_t desc = + lsc_fence_descriptor_for_intrinsic(devinfo, instr); + if (ugm_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX12_SFID_UGM, desc, + true /* commit_enable */, + 0 /* bti; ignored for LSC */); + } + + if (tgm_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX12_SFID_TGM, desc, + true /* commit_enable */, + 0 /* bti; ignored for LSC */); + } + + if (slm_fence) { + assert(opcode == SHADER_OPCODE_MEMORY_FENCE); + if (intel_needs_workaround(devinfo, 14014063774)) { + /* Wa_14014063774 + * + * Before SLM fence compiler needs to insert SYNC.ALLWR in order + * to avoid the SLM data race. 
+ */ + ubld.exec_all().group(1, 0).emit( + BRW_OPCODE_SYNC, ubld.null_reg_ud(), + brw_imm_ud(TGL_SYNC_ALLWR)); + } + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX12_SFID_SLM, desc, + true /* commit_enable */, + 0 /* BTI; ignored for LSC */); + } + + if (urb_fence) { + assert(opcode == SHADER_OPCODE_MEMORY_FENCE); + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, BRW_SFID_URB, desc, + true /* commit_enable */, + 0 /* BTI; ignored for LSC */); + } + } else if (devinfo->ver >= 11) { + if (tgm_fence || ugm_fence || urb_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, + true /* commit_enable HSD ES # 1404612949 */, + 0 /* BTI = 0 means data cache */); + } + + if (slm_fence) { + assert(opcode == SHADER_OPCODE_MEMORY_FENCE); + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, + true /* commit_enable HSD ES # 1404612949 */, + GFX7_BTI_SLM); + } + } else { + /* Prior to Icelake, they're all lumped into a single cache except on + * Ivy Bridge and Bay Trail where typed messages actually go through + * the render cache. There, we need both fences because we may + * access storage images as either typed or untyped. + */ + const bool render_fence = tgm_fence && devinfo->verx10 == 70; + + /* Simulation also complains on Gfx9 if we do not enable commit. + */ + const bool commit_enable = render_fence || + instr->intrinsic == nir_intrinsic_end_invocation_interlock || + devinfo->ver == 9; + + if (tgm_fence || ugm_fence || slm_fence || urb_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, + commit_enable, 0 /* BTI */); + } + + if (render_fence) { + fence_regs[fence_regs_count++] = + emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0, + commit_enable, /* bti */ 0); + } + } + + assert(fence_regs_count <= ARRAY_SIZE(fence_regs)); + + /* Be conservative in Gen11+ and always stall in a fence. 
Since + * there are two different fences, and shader might want to + * synchronize between them. + * + * TODO: Use scope and visibility information for the barriers from NIR + * to make a better decision on whether we need to stall. + */ + bool force_stall = devinfo->ver >= 11; + + /* There are four cases where we want to insert a stall: + * + * 1. If we're a nir_intrinsic_end_invocation_interlock. This is + * required to ensure that the shader EOT doesn't happen until + * after the fence returns. Otherwise, we might end up with the + * next shader invocation for that pixel not respecting our fence + * because it may happen on a different HW thread. + * + * 2. If we have multiple fences. This is required to ensure that + * they all complete and nothing gets weirdly out-of-order. + * + * 3. If we have no fences. In this case, we need at least a + * scheduling barrier to keep the compiler from moving things + * around in an invalid way. + * + * 4. On Gen11+ and platforms with LSC, we have multiple fence types, + * without further information about the fence, we need to force a + * stall. + */ + if (instr->intrinsic == nir_intrinsic_end_invocation_interlock || + fence_regs_count != 1 || devinfo->has_lsc || force_stall) { + ubld.exec_all().group(1, 0).emit( + FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), + fence_regs, fence_regs_count); + } + + break; + } + + case nir_intrinsic_shader_clock: { + /* We cannot do anything if there is an event, so ignore it for now */ + const fs_reg shader_clock = get_timestamp(bld); + const fs_reg srcs[] = { component(shader_clock, 0), + component(shader_clock, 1) }; + bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); + break; + } + + case nir_intrinsic_load_reloc_const_intel: { + uint32_t id = nir_intrinsic_param_idx(instr); + + /* Emit the reloc in the smallest SIMD size to limit register usage. 
*/ + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg small_dest = ubld.vgrf(dest.type); + ubld.UNDEF(small_dest); + ubld.exec_all().group(1, 0).emit(SHADER_OPCODE_MOV_RELOC_IMM, + small_dest, brw_imm_ud(id)); + + /* Copy propagation will get rid of this MOV. */ + bld.MOV(dest, component(small_dest, 0)); + break; + } + + case nir_intrinsic_load_uniform: { + /* Offsets are in bytes but they should always aligned to + * the type size + */ + unsigned base_offset = nir_intrinsic_base(instr); + assert(base_offset % 4 == 0 || base_offset % type_sz(dest.type) == 0); + + fs_reg src(UNIFORM, base_offset / 4, dest.type); + + if (nir_src_is_const(instr->src[0])) { + unsigned load_offset = nir_src_as_uint(instr->src[0]); + assert(load_offset % type_sz(dest.type) == 0); + /* The base offset can only handle 32-bit units, so for 16-bit + * data take the modulo of the offset with 4 bytes and add it to + * the offset to read from within the source register. + */ + src.offset = load_offset + base_offset % 4; + + for (unsigned j = 0; j < instr->num_components; j++) { + bld.MOV(offset(dest, bld, j), offset(src, bld, j)); + } + } else { + fs_reg indirect = retype(get_nir_src(ntb, instr->src[0]), + BRW_REGISTER_TYPE_UD); + + /* We need to pass a size to the MOV_INDIRECT but we don't want it to + * go past the end of the uniform. In order to keep the n'th + * component from running past, we subtract off the size of all but + * one component of the vector. 
+ */ + assert(nir_intrinsic_range(instr) >= + instr->num_components * type_sz(dest.type)); + unsigned read_size = nir_intrinsic_range(instr) - + (instr->num_components - 1) * type_sz(dest.type); + + bool supports_64bit_indirects = + devinfo->platform != INTEL_PLATFORM_CHV && !intel_device_info_is_9lp(devinfo); + + if (type_sz(dest.type) != 8 || supports_64bit_indirects) { + for (unsigned j = 0; j < instr->num_components; j++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + offset(dest, bld, j), offset(src, bld, j), + indirect, brw_imm_ud(read_size)); + } + } else { + const unsigned num_mov_indirects = + type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD); + /* We read a little bit less per MOV INDIRECT, as they are now + * 32-bits ones instead of 64-bit. Fix read_size then. + */ + const unsigned read_size_32bit = read_size - + (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD); + for (unsigned j = 0; j < instr->num_components; j++) { + for (unsigned i = 0; i < num_mov_indirects; i++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i), + subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i), + indirect, brw_imm_ud(read_size_32bit)); + } + } + } + } + break; + } + + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ubo_uniform_block_intel: { + fs_reg surface, surface_handle; + + if (get_nir_src_bindless(ntb, instr->src[0])) + surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr); + else + surface = get_nir_buffer_intrinsic_index(ntb, bld, instr); + + if (!nir_src_is_const(instr->src[1])) { + if (instr->intrinsic == nir_intrinsic_load_ubo) { + /* load_ubo with non-uniform offset */ + fs_reg base_offset = retype(get_nir_src(ntb, instr->src[1]), + BRW_REGISTER_TYPE_UD); + + const unsigned comps_per_load = type_sz(dest.type) == 8 ? 
2 : 4; + + for (int i = 0; i < instr->num_components; i += comps_per_load) { + const unsigned remaining = instr->num_components - i; + s.VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), + surface, surface_handle, + base_offset, + i * type_sz(dest.type), + instr->def.bit_size / 8, + MIN2(remaining, comps_per_load)); + } + + s.prog_data->has_ubo_pull = true; + } else { + /* load_ubo with uniform offset */ + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + srcs[SURFACE_LOGICAL_SRC_SURFACE] = surface; + srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle; + + const nir_src load_offset = instr->src[1]; + if (nir_src_is_const(load_offset)) { + fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset))); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0); + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + bld.emit_uniformize(get_nir_src(ntb, load_offset)); + } + + const unsigned total_dwords = + ALIGN(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4); + unsigned loaded_dwords = 0; + + const fs_reg packed_consts = + ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords); + + while (loaded_dwords < total_dwords) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, + total_dwords - loaded_dwords); + const unsigned block_bytes = block * 4; + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); + + const fs_builder &ubld = block <= 8 ? 
ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD), + srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = + align(block_bytes, REG_SIZE * reg_unit(devinfo)); + + loaded_dwords += block; + + ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], + srcs[SURFACE_LOGICAL_SRC_ADDRESS], + brw_imm_ud(block_bytes)); + } + + for (unsigned c = 0; c < instr->num_components; c++) { + bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD), + component(packed_consts, c)); + } + + s.prog_data->has_ubo_pull = true; + } + } else { + /* Even if we are loading doubles, a pull constant load will load + * a 32-bit vec4, so should only reserve vgrf space for that. If we + * need to load a full dvec4 we will have to emit 2 loads. This is + * similar to demote_pull_constants(), except that in that case we + * see individual accesses to each component of the vector and then + * we let CSE deal with duplicate loads. Here we see a vector access + * and we have to split it if necessary. 
+ */ + const unsigned type_size = type_sz(dest.type); + const unsigned load_offset = nir_src_as_uint(instr->src[1]); + const unsigned ubo_block = + brw_nir_ubo_surface_index_get_push_block(instr->src[0]); + const unsigned offset_256b = load_offset / 32; + const unsigned end_256b = + DIV_ROUND_UP(load_offset + type_size * instr->num_components, 32); + + /* See if we've selected this as a push constant candidate */ + fs_reg push_reg; + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *range = &s.prog_data->ubo_ranges[i]; + if (range->block == ubo_block && + offset_256b >= range->start && + end_256b <= range->start + range->length) { + + push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type); + push_reg.offset = load_offset - 32 * range->start; + break; + } + } + + if (push_reg.file != BAD_FILE) { + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dest, bld, i), + byte_offset(push_reg, i * type_size)); + } + break; + } + + s.prog_data->has_ubo_pull = true; + + const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ + const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); + + for (unsigned c = 0; c < instr->num_components;) { + const unsigned base = load_offset + c * type_size; + /* Number of usable components in the next block-aligned load. 
*/ + const unsigned count = MIN2(instr->num_components - c, + (block_sz - base % block_sz) / type_size); + + const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle; + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts, + srcs, PULL_UNIFORM_CONSTANT_SRCS); + + const fs_reg consts = + retype(byte_offset(packed_consts, base & (block_sz - 1)), + dest.type); + + for (unsigned d = 0; d < count; d++) + bld.MOV(offset(dest, bld, c + d), component(consts, d)); + + c += count; + } + } + break; + } + + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: { + assert(devinfo->ver >= 8); + + assert(instr->def.bit_size <= 32); + assert(nir_intrinsic_align(instr) > 0); + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[0]); + srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ + srcs[A64_LOGICAL_ENABLE_HELPERS] = + brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS); + + if (instr->def.bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(instr->def.num_components <= 4); + + srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); + + fs_inst *inst = + bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest, + srcs, A64_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * + inst->dst.component_size(inst->exec_size); + } else { + const unsigned bit_size = instr->def.bit_size; + assert(instr->def.num_components == 1); + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + + srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size); + + bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp, + srcs, A64_LOGICAL_NUM_SRCS); + bld.MOV(dest, subscript(tmp, dest.type, 0)); + } + 
break; + } + + case nir_intrinsic_store_global: { + assert(devinfo->ver >= 8); + + assert(nir_src_bit_size(instr->src[0]) <= 32); + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + assert(nir_intrinsic_align(instr) > 0); + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[A64_LOGICAL_ENABLE_HELPERS] = + brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS); + + if (nir_src_bit_size(instr->src[0]) == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_src_num_components(instr->src[0]) <= 4); + + srcs[A64_LOGICAL_SRC] = get_nir_src(ntb, instr->src[0]); /* Data */ + srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); + + bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, fs_reg(), + srcs, A64_LOGICAL_NUM_SRCS); + } else { + assert(nir_src_num_components(instr->src[0]) == 1); + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + brw_reg_type data_type = + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(tmp, retype(get_nir_src(ntb, instr->src[0]), data_type)); + + srcs[A64_LOGICAL_SRC] = tmp; + srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size); + + bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, fs_reg(), + srcs, A64_LOGICAL_NUM_SRCS); + } + break; + } + + case nir_intrinsic_global_atomic: + case nir_intrinsic_global_atomic_swap: + fs_nir_emit_global_atomic(ntb, bld, instr); + break; + + case nir_intrinsic_load_global_const_block_intel: { + assert(instr->def.bit_size == 32); + assert(instr->num_components == 8 || instr->num_components == 16); + + const fs_builder ubld = bld.exec_all().group(instr->num_components, 0); + fs_reg load_val; + + bool is_pred_const = nir_src_is_const(instr->src[1]); + if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) { + /* In this case, we don't want the UBO load at all. We really + * shouldn't get here but it's possible. 
+ */ + load_val = brw_imm_ud(0); + } else { + /* The uniform process may stomp the flag so do this first */ + fs_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0])); + + load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD); + + /* If the predicate is constant and we got here, then it's non-zero + * and we don't need the predicate at all. + */ + if (!is_pred_const) { + /* Load the predicate */ + fs_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1])); + fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred); + mov->conditional_mod = BRW_CONDITIONAL_NZ; + + /* Stomp the destination with 0 if we're OOB */ + mov = ubld.MOV(load_val, brw_imm_ud(0)); + mov->predicate = BRW_PREDICATE_NORMAL; + mov->predicate_inverse = true; + } + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = addr; + srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ + srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components); + /* This intrinsic loads memory from a uniform address, sometimes + * shared across lanes. We never need to mask it. + */ + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); + + fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, + load_val, srcs, A64_LOGICAL_NUM_SRCS); + if (!is_pred_const) + load->predicate = BRW_PREDICATE_NORMAL; + } + + /* From the HW perspective, we just did a single SIMD16 instruction + * which loaded a dword in each SIMD channel. From NIR's perspective, + * this instruction returns a vec16. Any users of this data in the + * back-end will expect a vec16 per SIMD channel so we have to emit a + * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop + * will generally clean them up for us. 
+ */ + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), + component(load_val, i)); + } + break; + } + + case nir_intrinsic_load_global_constant_uniform_block_intel: { + const unsigned total_dwords = ALIGN(instr->num_components, + REG_SIZE * reg_unit(devinfo) / 4); + unsigned loaded_dwords = 0; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const fs_reg packed_consts = + ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords); + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0])); + + while (loaded_dwords < total_dwords) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, + total_dwords - loaded_dwords); + const unsigned block_bytes = block * 4; + + const fs_builder &ubld = block <= 8 ? ubld8 : ubld16; + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = address; + srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ + srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); + ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD), + srcs, A64_LOGICAL_NUM_SRCS)->size_written = + align(block_bytes, REG_SIZE * reg_unit(devinfo)); + + increment_a64_address(ubld1, address, block_bytes); + loaded_dwords += block; + } + + for (unsigned c = 0; c < instr->num_components; c++) + bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD), + component(packed_consts, c)); + + break; + } + + case nir_intrinsic_load_ssbo: { + assert(devinfo->ver >= 7); + + const unsigned bit_size = instr->def.bit_size; + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[get_nir_src_bindless(ntb, instr->src[0]) ? 
+ SURFACE_LOGICAL_SRC_SURFACE_HANDLE : + SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_buffer_intrinsic_index(ntb, bld, instr); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + + /* Make dest unsigned because that's what the temporary will be */ + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + /* Read the vector */ + assert(bit_size <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(instr->def.num_components <= 4); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + fs_inst *inst = + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + inst->size_written = instr->num_components * s.dispatch_width * 4; + } else { + assert(instr->def.num_components == 1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + + fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(dest, subscript(read_result, dest.type, 0)); + } + break; + } + + case nir_intrinsic_store_ssbo: { + assert(devinfo->ver >= 7); + + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[get_nir_src_bindless(ntb, instr->src[1]) ? 
+ SURFACE_LOGICAL_SRC_SURFACE_HANDLE : + SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_buffer_intrinsic_index(ntb, bld, instr); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[2]); + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + + fs_reg data = get_nir_src(ntb, instr->src[0]); + data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + assert(bit_size <= 32); + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + assert(nir_src_num_components(instr->src[0]) <= 4); + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + assert(nir_src_num_components(instr->src[0]) == 1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + + srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); + + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + break; + } + + case nir_intrinsic_load_ssbo_uniform_block_intel: + case nir_intrinsic_load_shared_uniform_block_intel: { + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + const bool is_ssbo = + instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel; + if (is_ssbo) { + srcs[get_nir_src_bindless(ntb, instr->src[0]) ? 
+ SURFACE_LOGICAL_SRC_SURFACE_HANDLE : + SURFACE_LOGICAL_SRC_SURFACE] = + get_nir_buffer_intrinsic_index(ntb, bld, instr); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = fs_reg(brw_imm_ud(GFX7_BTI_SLM)); + } + + const unsigned total_dwords = ALIGN(instr->num_components, + REG_SIZE * reg_unit(devinfo) / 4); + unsigned loaded_dwords = 0; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const fs_reg packed_consts = + ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords); + + const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0]; + if (nir_src_is_const(load_offset)) { + fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset))); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0); + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + bld.emit_uniformize(get_nir_src(ntb, load_offset)); + } + + while (loaded_dwords < total_dwords) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, + total_dwords - loaded_dwords); + const unsigned block_bytes = block * 4; + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); + + const fs_builder &ubld = block <= 8 ? 
ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD), + srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = + align(block_bytes, REG_SIZE * reg_unit(devinfo)); + + loaded_dwords += block; + + ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], + srcs[SURFACE_LOGICAL_SRC_ADDRESS], + brw_imm_ud(block_bytes)); + } + + for (unsigned c = 0; c < instr->num_components; c++) + bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD), + component(packed_consts, c)); + + break; + } + + case nir_intrinsic_store_output: { + assert(nir_src_bit_size(instr->src[0]) == 32); + fs_reg src = get_nir_src(ntb, instr->src[0]); + + unsigned store_offset = nir_src_as_uint(instr->src[1]); + unsigned num_components = instr->num_components; + unsigned first_component = nir_intrinsic_component(instr); + + fs_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld, + 4 * store_offset), src.type); + for (unsigned j = 0; j < num_components; j++) { + bld.MOV(offset(new_dest, bld, j + first_component), + offset(src, bld, j)); + } + break; + } + + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + fs_nir_emit_surface_atomic(ntb, bld, instr, + get_nir_buffer_intrinsic_index(ntb, bld, instr), + get_nir_src_bindless(ntb, instr->src[0])); + break; + + case nir_intrinsic_get_ssbo_size: { + assert(nir_src_num_components(instr->src[0]) == 1); + + /* A resinfo's sampler message is used to get the buffer size. The + * SIMD8's writeback message consists of four registers and SIMD16's + * writeback message consists of 8 destination registers (two per each + * component). Because we are only interested on the first channel of + * the first returned component, where resinfo returns the buffer size + * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of + * the dispatch width. 
+ */ + const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0); + fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); + + /* Set LOD = 0 */ + ubld.MOV(src_payload, brw_imm_d(0)); + + fs_reg srcs[GET_BUFFER_SIZE_SRCS]; + srcs[get_nir_src_bindless(ntb, instr->src[0]) ? + GET_BUFFER_SIZE_SRC_SURFACE_HANDLE : + GET_BUFFER_SIZE_SRC_SURFACE] = + get_nir_buffer_intrinsic_index(ntb, bld, instr); + srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload; + fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload, + srcs, GET_BUFFER_SIZE_SRCS); + inst->header_size = 0; + inst->mlen = reg_unit(devinfo); + inst->size_written = 4 * REG_SIZE * reg_unit(devinfo); + + /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting: + * + * "Out-of-bounds checking is always performed at a DWord granularity. If + * any part of the DWord is out-of-bounds then the whole DWord is + * considered out-of-bounds." + * + * This implies that types with size smaller than 4-bytes need to be + * padded if they don't complete the last dword of the buffer. But as we + * need to maintain the original size we need to reverse the padding + * calculation to return the correct size to know the number of elements + * of an unsized array. 
As we stored in the last two bits of the surface + * size the needed padding for the buffer, we calculate here the + * original buffer_size reversing the surface_size calculation: + * + * surface_size = isl_align(buffer_size, 4) + + * (isl_align(buffer_size) - buffer_size) + * + * buffer_size = surface_size & ~3 - surface_size & 3 + */ + + fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD); + + ubld.AND(size_padding, ret_payload, brw_imm_ud(3)); + ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3)); + ubld.ADD(buffer_size, size_aligned4, negate(size_padding)); + + bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0)); + break; + } + + case nir_intrinsic_load_scratch: { + assert(devinfo->ver >= 7); + + assert(instr->def.num_components == 1); + const unsigned bit_size = instr->def.bit_size; + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + if (devinfo->verx10 >= 125) { + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); + ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 10))); + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS); + srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; + } else if (devinfo->ver >= 8) { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + const fs_reg nir_addr = get_nir_src(ntb, instr->src[0]); + + /* Make dest unsigned because that's what the temporary will be */ + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + /* Read the vector */ + assert(instr->def.num_components 
== 1); + assert(bit_size <= 32); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + if (devinfo->verx10 >= 125) { + assert(bit_size == 32 && + nir_intrinsic_align(instr) >= 4); + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, false); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); + + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + /* The offset for a DWORD scattered message is in dwords. */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, true); + + bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, false); + + fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(dest, read_result); + } + + s.shader_stats.fill_count += DIV_ROUND_UP(s.dispatch_width, 16); + break; + } + + case nir_intrinsic_store_scratch: { + assert(devinfo->ver >= 7); + + assert(nir_src_num_components(instr->src[0]) == 1); + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + if (devinfo->verx10 >= 125) { + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); + ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(INTEL_MASK(31, 10))); + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS); + srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; + } else if (devinfo->ver >= 8) { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + 
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + /** + * While this instruction has side-effects, it should not be predicated + * on sample mask, because otherwise fs helper invocations would + * load undefined values from scratch memory. And scratch memory + * load-stores are produced from operations without side-effects, thus + * they should not have different behaviour in the helper invocations. + */ + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); + const fs_reg nir_addr = get_nir_src(ntb, instr->src[1]); + + fs_reg data = get_nir_src(ntb, instr->src[0]); + data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + assert(nir_src_num_components(instr->src[0]) == 1); + assert(bit_size <= 32); + assert(nir_intrinsic_write_mask(instr) == 1); + assert(nir_intrinsic_align(instr) > 0); + if (bit_size == 32 && + nir_intrinsic_align(instr) >= 4) { + if (devinfo->verx10 >= 125) { + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, false); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); + + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + + /* The offset for a DWORD scattered message is in dwords. 
*/ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, true); + + bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + } else { + srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(ntb, bld, nir_addr, false); + + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + s.shader_stats.spill_count += DIV_ROUND_UP(s.dispatch_width, 16); + break; + } + + case nir_intrinsic_load_subgroup_size: + /* This should only happen for fragment shaders because every other case + * is lowered in NIR so we can optimize on it. + */ + assert(s.stage == MESA_SHADER_FRAGMENT); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(s.dispatch_width)); + break; + + case nir_intrinsic_load_subgroup_invocation: + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), + ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]); + break; + + case nir_intrinsic_load_subgroup_eq_mask: + case nir_intrinsic_load_subgroup_ge_mask: + case nir_intrinsic_load_subgroup_gt_mask: + case nir_intrinsic_load_subgroup_le_mask: + case nir_intrinsic_load_subgroup_lt_mask: + unreachable("not reached"); + + case nir_intrinsic_vote_any: { + const fs_builder ubld1 = bld.exec_all().group(1, 0); + + /* The any/all predicates do not consider channel enables. To prevent + * dead channels from affecting the result, we initialize the flag with + * with the identity value for the logical operation. + */ + if (s.dispatch_width == 32) { + /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. 
*/ + ubld1.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0)); + } else { + ubld1.MOV(brw_flag_reg(0, 0), brw_imm_uw(0)); + } + bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); + + /* For some reason, the any/all predicates don't work properly with + * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H + * doesn't read the correct subset of the flag register and you end up + * getting garbage in the second half. Work around this by using a pair + * of 1-wide MOVs and scattering the result. + */ + const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1; + fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); + ubld.MOV(res1, brw_imm_d(0)); + set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ANY : + s.dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H : + s.dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H : + BRW_PREDICATE_ALIGN1_ANY32H, + ubld.MOV(res1, brw_imm_d(-1))); + + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); + break; + } + case nir_intrinsic_vote_all: { + const fs_builder ubld1 = bld.exec_all().group(1, 0); + + /* The any/all predicates do not consider channel enables. To prevent + * dead channels from affecting the result, we initialize the flag with + * with the identity value for the logical operation. + */ + if (s.dispatch_width == 32) { + /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ + ubld1.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xffffffff)); + } else { + ubld1.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); + } + bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); + + /* For some reason, the any/all predicates don't work properly with + * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H + * doesn't read the correct subset of the flag register and you end up + * getting garbage in the second half. 
Work around this by using a pair + * of 1-wide MOVs and scattering the result. + */ + const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1; + fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); + ubld.MOV(res1, brw_imm_d(0)); + set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL : + s.dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : + s.dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : + BRW_PREDICATE_ALIGN1_ALL32H, + ubld.MOV(res1, brw_imm_d(-1))); + + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); + break; + } + case nir_intrinsic_vote_feq: + case nir_intrinsic_vote_ieq: { + fs_reg value = get_nir_src(ntb, instr->src[0]); + if (instr->intrinsic == nir_intrinsic_vote_feq) { + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B : + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F); + } + + fs_reg uniformized = bld.emit_uniformize(value); + const fs_builder ubld1 = bld.exec_all().group(1, 0); + + /* The any/all predicates do not consider channel enables. To prevent + * dead channels from affecting the result, we initialize the flag with + * with the identity value for the logical operation. + */ + if (s.dispatch_width == 32) { + /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ + ubld1.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xffffffff)); + } else { + ubld1.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); + } + bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z); + + /* For some reason, the any/all predicates don't work properly with + * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H + * doesn't read the correct subset of the flag register and you end up + * getting garbage in the second half. Work around this by using a pair + * of 1-wide MOVs and scattering the result. + */ + const fs_builder ubld = devinfo->ver >= 20 ? 
bld.exec_all() : ubld1; + fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); + ubld.MOV(res1, brw_imm_d(0)); + set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL : + s.dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : + s.dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : + BRW_PREDICATE_ALIGN1_ALL32H, + ubld.MOV(res1, brw_imm_d(-1))); + + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); + break; + } + + case nir_intrinsic_ballot: { + const fs_reg value = retype(get_nir_src(ntb, instr->src[0]), + BRW_REGISTER_TYPE_UD); + struct brw_reg flag = brw_flag_reg(0, 0); + /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well + * as f0.0. This is a problem for fragment programs as we currently use + * f0.1 for discards. Fortunately, we don't support SIMD32 fragment + * programs yet so this isn't a problem. When we do, something will + * have to change. + */ + if (s.dispatch_width == 32) + flag.type = BRW_REGISTER_TYPE_UD; + + bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u)); + bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); + + if (instr->def.bit_size > 32) { + dest.type = BRW_REGISTER_TYPE_UQ; + } else { + dest.type = BRW_REGISTER_TYPE_UD; + } + bld.MOV(dest, flag); + break; + } + + case nir_intrinsic_read_invocation: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + const fs_reg invocation = get_nir_src(ntb, instr->src[1]); + + fs_reg tmp = bld.vgrf(value.type); + + /* When for some reason the subgroup_size picked by NIR is larger than + * the dispatch size picked by the backend (this could happen in RT, + * FS), bound the invocation to the dispatch size. 
+ */ + fs_reg bound_invocation; + if (s.api_subgroup_size == 0 || + bld.dispatch_width() < s.api_subgroup_size) { + bound_invocation = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(bound_invocation, invocation, brw_imm_ud(s.dispatch_width - 1)); + } else { + bound_invocation = invocation; + } + bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value, + bld.emit_uniformize(bound_invocation)); + + bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0))); + break; + } + + case nir_intrinsic_read_first_invocation: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + bld.MOV(retype(dest, value.type), bld.emit_uniformize(value)); + break; + } + + case nir_intrinsic_shuffle: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + const fs_reg index = get_nir_src(ntb, instr->src[1]); + + bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); + break; + } + + case nir_intrinsic_first_invocation: { + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), + fs_reg(component(tmp, 0))); + break; + } + + case nir_intrinsic_last_invocation: { + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), + fs_reg(component(tmp, 0))); + break; + } + + case nir_intrinsic_quad_broadcast: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + const unsigned index = nir_src_as_uint(instr->src[1]); + + bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type), + value, brw_imm_ud(index), brw_imm_ud(4)); + break; + } + + case nir_intrinsic_quad_swap_horizontal: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + const fs_reg tmp = bld.vgrf(value.type); + if (devinfo->ver <= 7) { + /* The hardware doesn't seem to support these crazy regions with + * compressed instructions on gfx7 and earlier so we fall back to + * using quad swizzles. 
Fortunately, we don't support 64-bit + * anything in Vulkan on gfx7. + */ + assert(nir_src_bit_size(instr->src[0]) == 32); + const fs_builder ubld = bld.exec_all(); + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, + brw_imm_ud(BRW_SWIZZLE4(1,0,3,2))); + bld.MOV(retype(dest, value.type), tmp); + } else { + const fs_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0); + + const fs_reg src_left = horiz_stride(value, 2); + const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2); + const fs_reg tmp_left = horiz_stride(tmp, 2); + const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2); + + ubld.MOV(tmp_left, src_right); + ubld.MOV(tmp_right, src_left); + + } + bld.MOV(retype(dest, value.type), tmp); + break; + } + + case nir_intrinsic_quad_swap_vertical: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + if (nir_src_bit_size(instr->src[0]) == 32) { + /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ + const fs_reg tmp = bld.vgrf(value.type); + const fs_builder ubld = bld.exec_all(); + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, + brw_imm_ud(BRW_SWIZZLE4(2,3,0,1))); + bld.MOV(retype(dest, value.type), tmp); + } else { + /* For larger data types, we have to either emit dispatch_width many + * MOVs or else fall back to doing indirects. 
+ */ + fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); + bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], + brw_imm_w(0x2)); + bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); + } + break; + } + + case nir_intrinsic_quad_swap_diagonal: { + const fs_reg value = get_nir_src(ntb, instr->src[0]); + if (nir_src_bit_size(instr->src[0]) == 32) { + /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ + const fs_reg tmp = bld.vgrf(value.type); + const fs_builder ubld = bld.exec_all(); + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, + brw_imm_ud(BRW_SWIZZLE4(3,2,1,0))); + bld.MOV(retype(dest, value.type), tmp); + } else { + /* For larger data types, we have to either emit dispatch_width many + * MOVs or else fall back to doing indirects. + */ + fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); + bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], + brw_imm_w(0x3)); + bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); + } + break; + } + + case nir_intrinsic_reduce: { + fs_reg src = get_nir_src(ntb, instr->src[0]); + nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); + unsigned cluster_size = nir_intrinsic_cluster_size(instr); + if (cluster_size == 0 || cluster_size > s.dispatch_width) + cluster_size = s.dispatch_width; + + /* Figure out the source type */ + src.type = brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_op_infos[redop].input_types[0] | + nir_src_bit_size(instr->src[0]))); + + fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); + opcode brw_op = brw_op_for_nir_reduction_op(redop); + brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); + + /* Set up a register for all of our scratching around and initialize it + * to reduction operation's identity value. 
+ */ + fs_reg scan = bld.vgrf(src.type); + bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); + + bld.emit_scan(brw_op, scan, cluster_size, cond_mod); + + dest.type = src.type; + if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) { + /* In this case, CLUSTER_BROADCAST instruction isn't needed because + * the distance between clusters is at least 2 GRFs. In this case, + * we don't need the weird striding of the CLUSTER_BROADCAST + * instruction and can just do regular MOVs. + */ + assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0); + const unsigned groups = + (s.dispatch_width * type_sz(src.type)) / (REG_SIZE * 2); + const unsigned group_size = s.dispatch_width / groups; + for (unsigned i = 0; i < groups; i++) { + const unsigned cluster = (i * group_size) / cluster_size; + const unsigned comp = cluster * cluster_size + (cluster_size - 1); + bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size), + component(scan, comp)); + } + } else { + bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan, + brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size)); + } + break; + } + + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: { + fs_reg src = get_nir_src(ntb, instr->src[0]); + nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); + + /* Figure out the source type */ + src.type = brw_type_for_nir_type(devinfo, + (nir_alu_type)(nir_op_infos[redop].input_types[0] | + nir_src_bit_size(instr->src[0]))); + + fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); + opcode brw_op = brw_op_for_nir_reduction_op(redop); + brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); + + /* Set up a register for all of our scratching around and initialize it + * to reduction operation's identity value. 
+ */ + fs_reg scan = bld.vgrf(src.type); + const fs_builder allbld = bld.exec_all(); + allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); + + if (instr->intrinsic == nir_intrinsic_exclusive_scan) { + /* Exclusive scan is a bit harder because we have to do an annoying + * shift of the contents before we can begin. To make things worse, + * we can't do this with a normal stride; we have to use indirects. + */ + fs_reg shifted = bld.vgrf(src.type); + fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); + allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], + brw_imm_w(-1)); + allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx); + allbld.group(1, 0).MOV(component(shifted, 0), identity); + scan = shifted; + } + + bld.emit_scan(brw_op, scan, s.dispatch_width, cond_mod); + + bld.MOV(retype(dest, src.type), scan); + break; + } + + case nir_intrinsic_load_global_block_intel: { + assert(instr->def.bit_size == 32); + + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0])); + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const unsigned total = instr->num_components * s.dispatch_width; + unsigned loaded = 0; + + while (loaded < total) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, total - loaded); + const unsigned block_bytes = block * 4; + + const fs_builder &ubld = block == 8 ? 
ubld8 : ubld16; + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = address; + srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ + srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(1); + ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), + srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes; + + increment_a64_address(ubld1, address, block_bytes); + loaded += block; + } + + assert(loaded == total); + break; + } + + case nir_intrinsic_store_global_block_intel: { + assert(nir_src_bit_size(instr->src[0]) == 32); + + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[1])); + fs_reg src = get_nir_src(ntb, instr->src[0]); + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const unsigned total = instr->num_components * s.dispatch_width; + unsigned written = 0; + + while (written < total) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, total - written); + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = address; + srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4), + BRW_REGISTER_TYPE_UD); + srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); + + const fs_builder &ubld = block == 8 ? 
ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, fs_reg(), + srcs, A64_LOGICAL_NUM_SRCS); + + const unsigned block_bytes = block * 4; + increment_a64_address(ubld1, address, block_bytes); + written += block; + } + + assert(written == total); + break; + } + + case nir_intrinsic_load_shared_block_intel: + case nir_intrinsic_load_ssbo_block_intel: { + assert(instr->def.bit_size == 32); + + const bool is_ssbo = + instr->intrinsic == nir_intrinsic_load_ssbo_block_intel; + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 1 : 0])); + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? + get_nir_buffer_intrinsic_index(ntb, bld, instr) : + fs_reg(brw_imm_ud(GFX7_BTI_SLM)); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const unsigned total = instr->num_components * s.dispatch_width; + unsigned loaded = 0; + + while (loaded < total) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, total - loaded); + const unsigned block_bytes = block * 4; + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); + + const fs_builder &ubld = block == 8 ? ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), + srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes; + + ubld1.ADD(address, address, brw_imm_ud(block_bytes)); + loaded += block; + } + + assert(loaded == total); + break; + } + + case nir_intrinsic_store_shared_block_intel: + case nir_intrinsic_store_ssbo_block_intel: { + assert(nir_src_bit_size(instr->src[0]) == 32); + + const bool is_ssbo = + instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; + + fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 
2 : 1])); + fs_reg src = get_nir_src(ntb, instr->src[0]); + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? + get_nir_buffer_intrinsic_index(ntb, bld, instr) : + fs_reg(brw_imm_ud(GFX7_BTI_SLM)); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const unsigned total = instr->num_components * s.dispatch_width; + unsigned written = 0; + + while (written < total) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, total - written); + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); + srcs[SURFACE_LOGICAL_SRC_DATA] = + retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD); + + const fs_builder &ubld = block == 8 ? ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + + const unsigned block_bytes = block * 4; + ubld1.ADD(address, address, brw_imm_ud(block_bytes)); + written += block; + } + + assert(written == total); + break; + } + + case nir_intrinsic_load_topology_id_intel: { + /* These move around basically every hardware generation, so don't + * do any unbounded checks and fail if the platform hasn't explicitly + * been enabled here. + */ + assert(devinfo->ver >= 12 && devinfo->ver <= 20); + + /* Here is what the layout of SR0 looks like on Gfx12 + * https://gfxspecs.intel.com/Predator/Home/Index/47256 + * [13:11] : Slice ID. + * [10:9] : Dual-SubSlice ID + * [8] : SubSlice ID + * [7] : EUID[2] (aka EU Row ID) + * [6] : Reserved + * [5:4] : EUID[1:0] + * [2:0] : Thread ID + * + * Xe2: Engine 3D and GPGPU Programs, EU Overview, Registers and + * Register Regions, ARF Registers, State Register, + * https://gfxspecs.intel.com/Predator/Home/Index/56623 + * [15:11] : Slice ID. 
+ * [9:8] : SubSlice ID + * [6:4] : EUID + * [2:0] : Thread ID + */ + fs_reg raw_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_READ_SR_REG, raw_id, brw_imm_ud(0)); + switch (nir_intrinsic_base(instr)) { + case BRW_TOPOLOGY_ID_DSS: + if (devinfo->ver >= 20) { + /* Xe2+: 3D and GPGPU Programs, Shared Functions, Ray Tracing: + * https://gfxspecs.intel.com/Predator/Home/Index/56936 + * + * Note: DSSID in all formulas below is a logical identifier of an + * XeCore (a value that goes from 0 to (number_of_slices * + * number_of_XeCores_per_slice -1). SW can get this value from + * either: + * + * - Message Control Register LogicalSSID field (only in shaders + * eligible for Mid-Thread Preemption). + * - Calculated based of State Register with the following formula: + * DSSID = StateRegister.SliceID * GT_ARCH_SS_PER_SLICE + + * StateRRegister.SubSliceID where GT_SS_PER_SLICE is an + * architectural parameter defined per product SKU. + * + * We are using the state register to calculate the DSSID. + */ + fs_reg slice_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg subslice_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(slice_id, raw_id, brw_imm_ud(INTEL_MASK(15, 11))); + bld.SHR(slice_id, slice_id, brw_imm_ud(11)); + + /* Assert that max subslices covers at least 2 bits that we use for + * subslices. 
+ */ + assert(devinfo->max_subslices_per_slice >= (1 << 2)); + bld.MUL(slice_id, slice_id, + brw_imm_ud(devinfo->max_subslices_per_slice)); + bld.AND(subslice_id, raw_id, brw_imm_ud(INTEL_MASK(9, 8))); + bld.SHR(subslice_id, subslice_id, brw_imm_ud(8)); + bld.ADD(retype(dest, BRW_REGISTER_TYPE_UD), slice_id, + subslice_id); + } else { + bld.AND(raw_id, raw_id, brw_imm_ud(0x3fff)); + /* Get rid of anything below dualsubslice */ + bld.SHR(retype(dest, BRW_REGISTER_TYPE_UD), raw_id, brw_imm_ud(9)); + } + break; + case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: { + s.limit_dispatch_width(16, "Topology helper for Ray queries, " + "not supported in SIMD32 mode."); + fs_reg dst = retype(dest, BRW_REGISTER_TYPE_UD); + + if (devinfo->ver >= 20) { + /* Xe2+: Graphics Engine, 3D and GPGPU Programs, Shared Functions + * Ray Tracing, + * https://gfxspecs.intel.com/Predator/Home/Index/56936 + * + * SyncStackID = (EUID[2:0] << 8) | (ThreadID[2:0] << 4) | + * SIMDLaneID[3:0]; + * + * This section just deals with the EUID part. 
+ * + * The 3bit EU[2:0] we need to build for ray query memory addresses + * computations is a bit odd : + * + * EU[2:0] = raw_id[6:4] (identified as EUID[2:0]) + */ + bld.AND(dst, raw_id, brw_imm_ud(INTEL_MASK(6, 4))); + bld.SHL(dst, dst, brw_imm_ud(4)); + } else { + /* EU[3:0] << 7 + * + * The 4bit EU[3:0] we need to build for ray query memory addresses + * computations is a bit odd : + * + * EU[1:0] = raw_id[5:4] (identified as EUID[1:0]) + * EU[2] = raw_id[8] (identified as SubSlice ID) + * EU[3] = raw_id[7] (identified as EUID[2] or Row ID) + */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(7, 7))); + bld.SHL(dst, tmp, brw_imm_ud(3)); + bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(8, 8))); + bld.SHL(tmp, tmp, brw_imm_ud(1)); + bld.OR(dst, dst, tmp); + bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(5, 4))); + bld.SHL(tmp, tmp, brw_imm_ud(3)); + bld.OR(dst, dst, tmp); + } + + /* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0]) */ + { + bld.AND(raw_id, raw_id, brw_imm_ud(INTEL_MASK(2, 0))); + bld.SHL(raw_id, raw_id, brw_imm_ud(4)); + bld.OR(dst, dst, raw_id); + } + + /* LaneID[0:3] << 0 (Use nir SYSTEM_VALUE_SUBGROUP_INVOCATION) */ + assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */ + bld.ADD(dst, dst, + ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]); + break; + } + default: + unreachable("Invalid topology id type"); + } + break; + } + + case nir_intrinsic_load_btd_stack_id_intel: + if (s.stage == MESA_SHADER_COMPUTE) { + assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids); + } else { + assert(brw_shader_stage_is_bindless(s.stage)); + } + /* Stack IDs are always in R1 regardless of whether we're coming from a + * bindless shader or a regular compute shader. 
+ */ + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), + retype(brw_vec8_grf(1 * reg_unit(devinfo), 0), BRW_REGISTER_TYPE_UW)); + break; + + case nir_intrinsic_btd_spawn_intel: + if (s.stage == MESA_SHADER_COMPUTE) { + assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids); + } else { + assert(brw_shader_stage_is_bindless(s.stage)); + } + /* Make sure all the pointers to resume shaders have landed where other + * threads can see them. + */ + emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); + + bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(), + bld.emit_uniformize(get_nir_src(ntb, instr->src[0])), + get_nir_src(ntb, instr->src[1])); + break; + + case nir_intrinsic_btd_retire_intel: + if (s.stage == MESA_SHADER_COMPUTE) { + assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids); + } else { + assert(brw_shader_stage_is_bindless(s.stage)); + } + /* Make sure all the pointers to resume shaders have landed where other + * threads can see them. + */ + emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); + bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL); + break; + + case nir_intrinsic_trace_ray_intel: { + const bool synchronous = nir_intrinsic_synchronous(instr); + assert(brw_shader_stage_is_bindless(s.stage) || synchronous); + + /* Make sure all the previous RT structure writes are visible to the RT + * fixed function within the DSS, as well as stack pointers to resume + * shaders. 
+ */ + emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE); + + fs_reg srcs[RT_LOGICAL_NUM_SRCS]; + + fs_reg globals = get_nir_src(ntb, instr->src[0]); + srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals); + srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(ntb, instr->src[1]); + srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(ntb, instr->src[2]); + srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous); + bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(), + srcs, RT_LOGICAL_NUM_SRCS); + + /* There is no actual value to use in the destination register of the + * synchronous trace instruction. All of the communication with the HW + * unit happens through memory reads/writes. So to ensure that the + * operation has completed before we go read the results in memory, we + * need a barrier followed by an invalidate before accessing memory. + */ + if (synchronous) { + bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR)); + emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE); + } + break; + } + + default: +#ifndef NDEBUG + assert(instr->intrinsic < nir_num_intrinsics); + fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name); +#endif + unreachable("unknown intrinsic"); + } +} + +static fs_reg +expand_to_32bit(const fs_builder &bld, const fs_reg &src) +{ + if (type_sz(src.type) == 2) { + fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW)); + return src32; + } else { + return src; + } +} + +static void +fs_nir_emit_surface_atomic(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr, + fs_reg surface, + bool bindless) +{ + const intel_device_info *devinfo = ntb.devinfo; + fs_visitor &s = ntb.s; + + enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr); + int num_data = lsc_op_num_data_values(op); + + bool shared = surface.file == IMM && surface.ud == GFX7_BTI_SLM; + + /* The BTI untyped atomic messages only support 32-bit 
atomics. If you + * just look at the big table of messages in the Vol 7 of the SKL PRM, they + * appear to exist. However, if you look at Vol 2a, there are no message + * descriptors provided for Qword atomic ops except for A64 messages. + * + * 16-bit float atomics are supported, however. + */ + assert(instr->def.bit_size == 32 || + (instr->def.bit_size == 64 && devinfo->has_lsc) || + (instr->def.bit_size == 16 && + (devinfo->has_lsc || lsc_opcode_is_atomic_float(op)))); + + fs_reg dest = get_nir_def(ntb, instr->def); + + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + srcs[bindless ? + SURFACE_LOGICAL_SRC_SURFACE_HANDLE : + SURFACE_LOGICAL_SRC_SURFACE] = surface; + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); + + if (shared) { + /* SLM - Get the offset */ + if (nir_src_is_const(instr->src[0])) { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + brw_imm_ud(nir_intrinsic_base(instr) + + nir_src_as_uint(instr->src[0])); + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = s.vgrf(glsl_uint_type()); + bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], + retype(get_nir_src(ntb, instr->src[0]), BRW_REGISTER_TYPE_UD), + brw_imm_ud(nir_intrinsic_base(instr))); + } + } else { + /* SSBOs */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]); + } + + fs_reg data; + if (num_data >= 1) + data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 1 : 2])); + + if (num_data >= 2) { + fs_reg tmp = bld.vgrf(data.type, 2); + fs_reg sources[2] = { + data, + expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 
2 : 3])) + }; + bld.LOAD_PAYLOAD(tmp, sources, 2, 0); + data = tmp; + } + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + + /* Emit the actual atomic operation */ + + switch (instr->def.bit_size) { + case 16: { + fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, + retype(dest32, dest.type), + srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), + retype(dest32, BRW_REGISTER_TYPE_UD)); + break; + } + + case 32: + case 64: + bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + break; + default: + unreachable("Unsupported bit size"); + } +} + +static void +fs_nir_emit_global_atomic(nir_to_brw_state &ntb, const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr); + int num_data = lsc_op_num_data_values(op); + + fs_reg dest = get_nir_def(ntb, instr->def); + + fs_reg addr = get_nir_src(ntb, instr->src[0]); + + fs_reg data; + if (num_data >= 1) + data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[1])); + + if (num_data >= 2) { + fs_reg tmp = bld.vgrf(data.type, 2); + fs_reg sources[2] = { + data, + expand_to_32bit(bld, get_nir_src(ntb, instr->src[2])) + }; + bld.LOAD_PAYLOAD(tmp, sources, 2, 0); + data = tmp; + } + + fs_reg srcs[A64_LOGICAL_NUM_SRCS]; + srcs[A64_LOGICAL_ADDRESS] = addr; + srcs[A64_LOGICAL_SRC] = data; + srcs[A64_LOGICAL_ARG] = brw_imm_ud(op); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); + + switch (instr->def.bit_size) { + case 16: { + fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, + retype(dest32, dest.type), + srcs, A64_LOGICAL_NUM_SRCS); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32); + break; + } + case 32: + case 64: + bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest, + srcs, A64_LOGICAL_NUM_SRCS); + break; + default: + unreachable("Unsupported bit size"); + } +} + +static void +fs_nir_emit_texture(nir_to_brw_state 
&ntb, + nir_tex_instr *instr) +{ + const intel_device_info *devinfo = ntb.devinfo; + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + + /* SKL PRMs: Volume 7: 3D-Media-GPGPU: + * + * "The Pixel Null Mask field, when enabled via the Pixel Null Mask + * Enable will be incorect for sample_c when applied to a surface with + * 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask + * Enable may incorrectly report pixels as referencing a Null surface." + * + * We'll take care of this in NIR. + */ + assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE); + + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(instr->is_sparse); + + int lod_components = 0; + + /* The hardware requires a LOD for buffer textures */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) + srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); + + ASSERTED bool got_lod = false; + ASSERTED bool got_bias = false; + uint32_t header_bits = 0; + for (unsigned i = 0; i < instr->num_srcs; i++) { + nir_src nir_src = instr->src[i].src; + fs_reg src = get_nir_src(ntb, nir_src); + switch (instr->src[i].src_type) { + case nir_tex_src_bias: + assert(!got_lod); + got_bias = true; + + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_comparator: + srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_coord: + switch (instr->op) { + case nir_texop_txf: + case nir_texop_txf_ms: + case nir_texop_txf_ms_mcs_intel: + case nir_texop_samples_identical: + srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D); + break; + default: + srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F); + break; + } + break; + case nir_tex_src_ddx: + srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F); + lod_components = nir_tex_instr_src_size(instr, i); + break; + case nir_tex_src_ddy: + srcs[TEX_LOGICAL_SRC_LOD2] = 
retype(src, BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_lod: + assert(!got_bias); + got_lod = true; + + switch (instr->op) { + case nir_texop_txs: + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_UD); + break; + case nir_texop_txf: + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_D); + break; + default: + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + } + break; + case nir_tex_src_min_lod: + srcs[TEX_LOGICAL_SRC_MIN_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + case nir_tex_src_ms_index: + srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD); + break; + + case nir_tex_src_offset: { + uint32_t offset_bits = 0; + if (brw_texture_offset(instr, i, &offset_bits)) { + header_bits |= offset_bits; + } else { + /* On gfx12.5+, if the offsets are not both constant and in the + * {-8,7} range, nir_lower_tex() will have already lowered the + * source offset. So we should never reach this point. 
+ */ + assert(devinfo->verx10 < 125); + srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = + retype(src, BRW_REGISTER_TYPE_D); + } + break; + } + + case nir_tex_src_projector: + unreachable("should be lowered"); + + case nir_tex_src_texture_offset: { + assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE); + /* Emit code to evaluate the actual indexing expression */ + if (instr->texture_index == 0 && is_resource_src(nir_src)) + srcs[TEX_LOGICAL_SRC_SURFACE] = get_resource_nir_src(ntb, nir_src); + if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) { + fs_reg tmp = s.vgrf(glsl_uint_type()); + bld.ADD(tmp, src, brw_imm_ud(instr->texture_index)); + srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp); + } + assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE); + break; + } + + case nir_tex_src_sampler_offset: { + /* Emit code to evaluate the actual indexing expression */ + if (instr->sampler_index == 0 && is_resource_src(nir_src)) + srcs[TEX_LOGICAL_SRC_SAMPLER] = get_resource_nir_src(ntb, nir_src); + if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) { + fs_reg tmp = s.vgrf(glsl_uint_type()); + bld.ADD(tmp, src, brw_imm_ud(instr->sampler_index)); + srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp); + } + break; + } + + case nir_tex_src_texture_handle: + assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1); + srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg(); + if (is_resource_src(nir_src)) + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = get_resource_nir_src(ntb, nir_src); + if (srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE) + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src); + break; + + case nir_tex_src_sampler_handle: + assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1); + srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg(); + if (is_resource_src(nir_src)) + srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = get_resource_nir_src(ntb, nir_src); + if (srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE) + 
srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src); + break; + + case nir_tex_src_ms_mcs_intel: + assert(instr->op == nir_texop_txf_ms); + srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D); + break; + + /* If this parameter is present, we are packing either the explicit LOD + * or LOD bias and the array index into a single (32-bit) value when + * 32-bit texture coordinates are used. + */ + case nir_tex_src_backend1: + assert(!got_lod && !got_bias); + got_lod = true; + + assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb); + srcs[TEX_LOGICAL_SRC_LOD] = + retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F); + break; + + default: + unreachable("unknown texture source"); + } + } + + /* If the surface or sampler were not specified through sources, use the + * instruction index. + */ + if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE && + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE) + srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index); + if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE && + srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE) + srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index); + + if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE && + (instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical)) { + if (devinfo->ver >= 7) { + srcs[TEX_LOGICAL_SRC_MCS] = + emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE], + instr->coord_components, + srcs[TEX_LOGICAL_SRC_SURFACE], + srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]); + } else { + srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u); + } + } + + srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components); + srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components); + + enum opcode opcode; + switch (instr->op) { + case nir_texop_tex: + opcode = SHADER_OPCODE_TEX_LOGICAL; + break; + case nir_texop_txb: + opcode = FS_OPCODE_TXB_LOGICAL; + break; + case nir_texop_txl: + opcode = 
SHADER_OPCODE_TXL_LOGICAL; + break; + case nir_texop_txd: + opcode = SHADER_OPCODE_TXD_LOGICAL; + break; + case nir_texop_txf: + opcode = SHADER_OPCODE_TXF_LOGICAL; + break; + case nir_texop_txf_ms: + /* On Gfx12HP there is only CMS_W available. From the Bspec: Shared + * Functions - 3D Sampler - Messages - Message Format: + * + * ld2dms REMOVEDBY(GEN:HAS:1406788836) + */ + if (devinfo->verx10 >= 125) + opcode = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL; + else if (devinfo->ver >= 9) + opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL; + else + opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; + break; + case nir_texop_txf_ms_mcs_intel: + opcode = SHADER_OPCODE_TXF_MCS_LOGICAL; + break; + case nir_texop_query_levels: + case nir_texop_txs: + opcode = SHADER_OPCODE_TXS_LOGICAL; + break; + case nir_texop_lod: + opcode = SHADER_OPCODE_LOD_LOGICAL; + break; + case nir_texop_tg4: + if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) + opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL; + else + opcode = SHADER_OPCODE_TG4_LOGICAL; + break; + case nir_texop_texture_samples: + opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL; + break; + case nir_texop_samples_identical: { + fs_reg dst = retype(get_nir_def(ntb, instr->def), BRW_REGISTER_TYPE_D); + + /* If mcs is an immediate value, it means there is no MCS. In that case + * just return false. 
+ */ + if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) { + bld.MOV(dst, brw_imm_ud(0u)); + } else if (devinfo->ver >= 9) { + fs_reg tmp = s.vgrf(glsl_uint_type()); + bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS], + offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1)); + bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ); + } else { + bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u), + BRW_CONDITIONAL_EQ); + } + return; + } + default: + unreachable("unknown texture opcode"); + } + + if (instr->op == nir_texop_tg4) { + if (instr->component == 1 && + s.key_tex->gather_channel_quirk_mask & (1 << instr->texture_index)) { + /* gather4 sampler is broken for green channel on RG32F -- + * we must ask for blue instead. + */ + header_bits |= 2 << 16; + } else { + header_bits |= instr->component << 16; + } + } + + fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse); + fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); + inst->offset = header_bits; + + const unsigned dest_size = nir_tex_instr_dest_size(instr); + if (devinfo->ver >= 9 && + instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) { + unsigned write_mask = nir_def_components_read(&instr->def); + assert(write_mask != 0); /* dead code should have been eliminated */ + if (instr->is_sparse) { + inst->size_written = (util_last_bit(write_mask) - 1) * + inst->dst.component_size(inst->exec_size) + + (reg_unit(devinfo) * REG_SIZE); + } else { + inst->size_written = util_last_bit(write_mask) * + inst->dst.component_size(inst->exec_size); + } + } else { + inst->size_written = 4 * inst->dst.component_size(inst->exec_size) + + (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0); + } + + if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) + inst->shadow_compare = true; + + /* Wa_14012688258: + * + * Don't trim zeros at the end of payload for sample operations + * in cube and cube arrays. 
+ */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + intel_needs_workaround(devinfo, 14012688258)) { + + /* Compiler should send U,V,R parameters even if V,R are 0. */ + if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE) + assert(instr->coord_components >= 3u); + + /* See opt_zero_samples(). */ + inst->keep_payload_trailing_zeros = true; + } + + fs_reg nir_dest[5]; + for (unsigned i = 0; i < dest_size; i++) + nir_dest[i] = offset(dst, bld, i); + + if (instr->op == nir_texop_query_levels) { + /* # levels is in .w */ + if (devinfo->ver <= 9) { + /** + * Wa_1940217: + * + * When a surface of type SURFTYPE_NULL is accessed by resinfo, the + * MIPCount returned is undefined instead of 0. + */ + fs_inst *mov = bld.MOV(bld.null_reg_d(), dst); + mov->conditional_mod = BRW_CONDITIONAL_NZ; + nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D); + fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0)); + sel->predicate = BRW_PREDICATE_NORMAL; + } else { + nir_dest[0] = offset(dst, bld, 3); + } + } else if (instr->op == nir_texop_txs && + dest_size >= 3 && devinfo->ver < 7) { + /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */ + fs_reg depth = offset(dst, bld, 2); + nir_dest[2] = s.vgrf(glsl_int_type()); + bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE); + } + + /* The residency bits are only in the first component. 
*/ + if (instr->is_sparse) + nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0); + + bld.LOAD_PAYLOAD(get_nir_def(ntb, instr->def), nir_dest, dest_size, 0); +} + +static void +fs_nir_emit_jump(nir_to_brw_state &ntb, nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + ntb.bld.emit(BRW_OPCODE_BREAK); + break; + case nir_jump_continue: + ntb.bld.emit(BRW_OPCODE_CONTINUE); + break; + case nir_jump_halt: + ntb.bld.emit(BRW_OPCODE_HALT); + break; + case nir_jump_return: + default: + unreachable("unknown jump"); + } +} + +/* + * This helper takes a source register and un/shuffles it into the destination + * register. + * + * If source type size is smaller than destination type size the operation + * needed is a component shuffle. The opposite case would be an unshuffle. If + * source/destination type size is equal a shuffle is done that would be + * equivalent to a simple MOV. + * + * For example, if source is a 16-bit type and destination is 32-bit. A 3 + * components .xyz 16-bit vector on SIMD8 would be. + * + * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8| + * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | | + * + * This helper will return the following 2 32-bit components with the 16-bit + * values shuffled: + * + * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8| + * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 | + * + * For unshuffle, the example would be the opposite, a 64-bit type source + * and a 32-bit destination. 
A 2 component .xy 64-bit vector on SIMD8 + * would be: + * + * | x1l x1h | x2l x2h | x3l x3h | x4l x4h | + * | x5l x5h | x6l x6h | x7l x7h | x8l x8h | + * | y1l y1h | y2l y2h | y3l y3h | y4l y4h | + * | y5l y5h | y6l y6h | y7l y7h | y8l y8h | + * + * The returned result would be the following 4 32-bit components unshuffled: + * + * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l | + * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h | + * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l | + * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h | + * + * - Source and destination register must not be overlapped. + * - components units are measured in terms of the smaller type between + * source and destination because we are un/shuffling the smaller + * components from/into the bigger ones. + * - first_component parameter allows skipping source components. + */ +void +shuffle_src_to_dst(const fs_builder &bld, + const fs_reg &dst, + const fs_reg &src, + uint32_t first_component, + uint32_t components) +{ + if (type_sz(src.type) == type_sz(dst.type)) { + assert(!regions_overlap(dst, + type_sz(dst.type) * bld.dispatch_width() * components, + offset(src, bld, first_component), + type_sz(src.type) * bld.dispatch_width() * components)); + for (unsigned i = 0; i < components; i++) { + bld.MOV(retype(offset(dst, bld, i), src.type), + offset(src, bld, i + first_component)); + } + } else if (type_sz(src.type) < type_sz(dst.type)) { + /* Source is shuffled into destination */ + unsigned size_ratio = type_sz(dst.type) / type_sz(src.type); + assert(!regions_overlap(dst, + type_sz(dst.type) * bld.dispatch_width() * + DIV_ROUND_UP(components, size_ratio), + offset(src, bld, first_component), + type_sz(src.type) * bld.dispatch_width() * components)); + + brw_reg_type shuffle_type = + brw_reg_type_from_bit_size(8 * type_sz(src.type), + BRW_REGISTER_TYPE_D); + for (unsigned i = 0; i < components; i++) { + fs_reg shuffle_component_i = + subscript(offset(dst, bld, i / size_ratio), + 
shuffle_type, i % size_ratio); + bld.MOV(shuffle_component_i, + retype(offset(src, bld, i + first_component), shuffle_type)); + } + } else { + /* Source is unshuffled into destination */ + unsigned size_ratio = type_sz(src.type) / type_sz(dst.type); + assert(!regions_overlap(dst, + type_sz(dst.type) * bld.dispatch_width() * components, + offset(src, bld, first_component / size_ratio), + type_sz(src.type) * bld.dispatch_width() * + DIV_ROUND_UP(components + (first_component % size_ratio), + size_ratio))); + + brw_reg_type shuffle_type = + brw_reg_type_from_bit_size(8 * type_sz(dst.type), + BRW_REGISTER_TYPE_D); + for (unsigned i = 0; i < components; i++) { + fs_reg shuffle_component_i = + subscript(offset(src, bld, (first_component + i) / size_ratio), + shuffle_type, (first_component + i) % size_ratio); + bld.MOV(retype(offset(dst, bld, i), shuffle_type), + shuffle_component_i); + } + } +} + +void +shuffle_from_32bit_read(const fs_builder &bld, + const fs_reg &dst, + const fs_reg &src, + uint32_t first_component, + uint32_t components) +{ + assert(type_sz(src.type) == 4); + + /* This function takes components in units of the destination type while + * shuffle_src_to_dst takes components in units of the smallest type + */ + if (type_sz(dst.type) > 4) { + assert(type_sz(dst.type) == 8); + first_component *= 2; + components *= 2; + } + + shuffle_src_to_dst(bld, dst, src, first_component, components); +} + +fs_reg +setup_imm_df(const fs_builder &bld, double v) +{ + const struct intel_device_info *devinfo = bld.shader->devinfo; + assert(devinfo->ver >= 7); + + if (devinfo->ver >= 8) + return brw_imm_df(v); + + /* gfx7.5 does not support DF immediates straightforward but the DIM + * instruction allows to set the 64-bit immediate value. 
+ */ + if (devinfo->platform == INTEL_PLATFORM_HSW) { + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1); + ubld.DIM(dst, brw_imm_df(v)); + return component(dst, 0); + } + + /* gfx7 does not support DF immediates, so we generate a 64-bit constant by + * writing the low 32-bit of the constant to suboffset 0 of a VGRF and + * the high 32-bit to suboffset 4 and then applying a stride of 0. + * + * Alternatively, we could also produce a normal VGRF (without stride 0) + * by writing to all the channels in the VGRF, however, that would hit the + * gfx7 bug where we have to split writes that span more than 1 register + * into instructions with a width of 4 (otherwise the write to the second + * register written runs into an execmask hardware bug) which isn't very + * nice. + */ + union { + double d; + struct { + uint32_t i1; + uint32_t i2; + }; + } di; + + di.d = v; + + const fs_builder ubld = bld.exec_all().group(1, 0); + const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); + ubld.MOV(tmp, brw_imm_ud(di.i1)); + ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2)); + + return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0); +} + +fs_reg +setup_imm_b(const fs_builder &bld, int8_t v) +{ + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B); + bld.MOV(tmp, brw_imm_w(v)); + return tmp; +} + +fs_reg +setup_imm_ub(const fs_builder &bld, uint8_t v) +{ + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB); + bld.MOV(tmp, brw_imm_uw(v)); + return tmp; +} + +static void +fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr) +{ + ntb.bld = ntb.bld.annotate(NULL, instr); + + switch (instr->type) { + case nir_instr_type_alu: + fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true); + break; + + case nir_instr_type_deref: + unreachable("All derefs should've been lowered"); + break; + + case nir_instr_type_intrinsic: + switch (ntb.s.stage) { + case MESA_SHADER_VERTEX: + fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + 
break; + case MESA_SHADER_TESS_CTRL: + fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_TESS_EVAL: + fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_GEOMETRY: + fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_FRAGMENT: + fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_RAYGEN: + case MESA_SHADER_ANY_HIT: + case MESA_SHADER_CLOSEST_HIT: + case MESA_SHADER_MISS: + case MESA_SHADER_INTERSECTION: + case MESA_SHADER_CALLABLE: + fs_nir_emit_bs_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_TASK: + fs_nir_emit_task_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_MESH: + fs_nir_emit_mesh_intrinsic(ntb, nir_instr_as_intrinsic(instr)); + break; + default: + unreachable("unsupported shader stage"); + } + break; + + case nir_instr_type_tex: + fs_nir_emit_texture(ntb, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_load_const: + fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_undef: + /* We create a new VGRF for undefs on every use (by handling + * them in get_nir_src()), rather than for each definition. + * This helps register coalescing eliminate MOVs from undef. 
+ */ + break; + + case nir_instr_type_jump: + fs_nir_emit_jump(ntb, nir_instr_as_jump(instr)); + break; + + default: + unreachable("unknown instruction type"); + } +} + +static unsigned +brw_rnd_mode_from_nir(unsigned mode, unsigned *mask) +{ + unsigned brw_mode = 0; + *mask = 0; + + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) & + mode) { + brw_mode |= BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT; + *mask |= BRW_CR0_RND_MODE_MASK; + } + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) & + mode) { + brw_mode |= BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT; + *mask |= BRW_CR0_RND_MODE_MASK; + } + if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) { + brw_mode |= BRW_CR0_FP16_DENORM_PRESERVE; + *mask |= BRW_CR0_FP16_DENORM_PRESERVE; + } + if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) { + brw_mode |= BRW_CR0_FP32_DENORM_PRESERVE; + *mask |= BRW_CR0_FP32_DENORM_PRESERVE; + } + if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) { + brw_mode |= BRW_CR0_FP64_DENORM_PRESERVE; + *mask |= BRW_CR0_FP64_DENORM_PRESERVE; + } + if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) + *mask |= BRW_CR0_FP16_DENORM_PRESERVE; + if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) + *mask |= BRW_CR0_FP32_DENORM_PRESERVE; + if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64) + *mask |= BRW_CR0_FP64_DENORM_PRESERVE; + if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE) + *mask |= BRW_CR0_FP_MODE_MASK; + + if (*mask != 0) + assert((*mask & brw_mode) == brw_mode); + + return brw_mode; +} + +static void +emit_shader_float_controls_execution_mode(nir_to_brw_state &ntb) +{ + const fs_builder &bld = ntb.bld; + fs_visitor &s = ntb.s; + + unsigned execution_mode = s.nir->info.float_controls_execution_mode; + if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE) + return; + + fs_builder ubld = bld.exec_all().group(1, 0); + 
fs_builder abld = ubld.annotate("shader floats control execution mode"); + unsigned mask, mode = brw_rnd_mode_from_nir(execution_mode, &mask); + + if (mask == 0) + return; + + abld.emit(SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(), + brw_imm_d(mode), brw_imm_d(mask)); +} + +void +nir_to_brw(fs_visitor *s) +{ + nir_to_brw_state ntb = { + .s = *s, + .nir = s->nir, + .devinfo = s->devinfo, + .mem_ctx = ralloc_context(NULL), + .bld = fs_builder(s).at_end(), + }; + + emit_shader_float_controls_execution_mode(ntb); + + /* emit the arrays used for inputs and outputs - load/store intrinsics will + * be converted to reads/writes of these arrays + */ + fs_nir_setup_outputs(ntb); + fs_nir_setup_uniforms(ntb.s); + fs_nir_emit_system_values(ntb); + ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width; + + fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir)); + + ntb.bld.emit(SHADER_OPCODE_HALT_TARGET); + + ralloc_free(ntb.mem_ctx); +} + diff --git a/src/intel/compiler/elk/brw_fs_reg_allocate.cpp b/src/intel/compiler/elk/brw_fs_reg_allocate.cpp new file mode 100644 index 00000000000..cc0f4762bc6 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_reg_allocate.cpp @@ -0,0 +1,1412 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +#include "brw_eu.h" +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_cfg.h" +#include "util/set.h" +#include "util/register_allocate.h" + +using namespace brw; + +#define REG_CLASS_COUNT 20 + +static void +assign_reg(const struct intel_device_info *devinfo, + unsigned *reg_hw_locations, fs_reg *reg) +{ + if (reg->file == VGRF) { + reg->nr = reg_unit(devinfo) * reg_hw_locations[reg->nr] + reg->offset / REG_SIZE; + reg->offset %= REG_SIZE; + } +} + +void +fs_visitor::assign_regs_trivial() +{ + unsigned hw_reg_mapping[this->alloc.count + 1]; + unsigned i; + int reg_width = dispatch_width / 8; + + /* Note that compressed instructions require alignment to 2 registers. */ + hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width); + for (i = 1; i <= this->alloc.count; i++) { + hw_reg_mapping[i] = (hw_reg_mapping[i - 1] + + DIV_ROUND_UP(this->alloc.sizes[i - 1], + reg_unit(devinfo))); + } + this->grf_used = hw_reg_mapping[this->alloc.count]; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + assign_reg(devinfo, hw_reg_mapping, &inst->dst); + for (i = 0; i < inst->sources; i++) { + assign_reg(devinfo, hw_reg_mapping, &inst->src[i]); + } + } + + if (this->grf_used >= max_grf) { + fail("Ran out of regs on trivial allocator (%d/%d)\n", + this->grf_used, max_grf); + } else { + this->alloc.count = this->grf_used; + } + +} + +/** + * Size of a register from the aligned_bary_class register class. 
+ */ +static unsigned +aligned_bary_size(unsigned dispatch_width) +{ + return (dispatch_width == 8 ? 2 : 4); +} + +static void +brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) +{ + const struct intel_device_info *devinfo = compiler->devinfo; + int base_reg_count = BRW_MAX_GRF; + const int index = util_logbase2(dispatch_width / 8); + + if (dispatch_width > 8 && devinfo->ver >= 7) { + /* For IVB+, we don't need the PLN hacks or the even-reg alignment in + * SIMD16. Therefore, we can use the exact same register sets for + * SIMD16 as we do for SIMD8 and we don't need to recalculate them. + */ + compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0]; + return; + } + + /* The registers used to make up almost all values handled in the compiler + * are a scalar value occupying a single register (or 2 registers in the + * case of SIMD16, which is handled by dividing base_reg_count by 2 and + * multiplying allocated register numbers by 2). Things that were + * aggregates of scalar values at the GLSL level were split to scalar + * values by split_virtual_grfs(). + * + * However, texture SEND messages return a series of contiguous registers + * to write into. We currently always ask for 4 registers, but we may + * convert that to use less some day. + * + * Additionally, on gfx5 we need aligned pairs of registers for the PLN + * instruction, and on gfx4 we need 8 contiguous regs for workaround simd16 + * texturing. 
+ */ + assert(REG_CLASS_COUNT == MAX_VGRF_SIZE(devinfo) / reg_unit(devinfo)); + int class_sizes[REG_CLASS_COUNT]; + for (unsigned i = 0; i < REG_CLASS_COUNT; i++) + class_sizes[i] = i + 1; + + struct ra_regs *regs = ra_alloc_reg_set(compiler, BRW_MAX_GRF, false); + if (devinfo->ver >= 6) + ra_set_allocate_round_robin(regs); + struct ra_class **classes = ralloc_array(compiler, struct ra_class *, + REG_CLASS_COUNT); + struct ra_class *aligned_bary_class = NULL; + + /* Now, make the register classes for each size of contiguous register + * allocation we might need to make. + */ + for (int i = 0; i < REG_CLASS_COUNT; i++) { + classes[i] = ra_alloc_contig_reg_class(regs, class_sizes[i]); + + if (devinfo->ver <= 5 && dispatch_width >= 16) { + /* From the G45 PRM: + * + * In order to reduce the hardware complexity, the following + * rules and restrictions apply to the compressed instruction: + * ... + * * Operand Alignment Rule: With the exceptions listed below, a + * source/destination operand in general should be aligned to + * even 256-bit physical register with a region size equal to + * two 256-bit physical register + */ + for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg += 2) + ra_class_add_reg(classes[i], reg); + } else { + for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg++) + ra_class_add_reg(classes[i], reg); + } + } + + /* Add a special class for aligned barycentrics, which we'll put the + * first source of LINTERP on so that we can do PLN on Gen <= 6. 
+ */ + if (devinfo->has_pln && (devinfo->ver == 6 || + (dispatch_width == 8 && devinfo->ver <= 5))) { + int contig_len = aligned_bary_size(dispatch_width); + aligned_bary_class = ra_alloc_contig_reg_class(regs, contig_len); + + for (int i = 0; i <= base_reg_count - contig_len; i += 2) + ra_class_add_reg(aligned_bary_class, i); + } + + ra_set_finalize(regs, NULL); + + compiler->fs_reg_sets[index].regs = regs; + for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++) + compiler->fs_reg_sets[index].classes[i] = NULL; + for (int i = 0; i < REG_CLASS_COUNT; i++) + compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i]; + compiler->fs_reg_sets[index].aligned_bary_class = aligned_bary_class; +} + +void +brw_fs_alloc_reg_sets(struct brw_compiler *compiler) +{ + brw_alloc_reg_set(compiler, 8); + brw_alloc_reg_set(compiler, 16); + brw_alloc_reg_set(compiler, 32); +} + +static int +count_to_loop_end(const bblock_t *block) +{ + if (block->end()->opcode == BRW_OPCODE_WHILE) + return block->end_ip; + + int depth = 1; + /* Skip the first block, since we don't want to count the do the calling + * function found. + */ + for (block = block->next(); + depth > 0; + block = block->next()) { + if (block->start()->opcode == BRW_OPCODE_DO) + depth++; + if (block->end()->opcode == BRW_OPCODE_WHILE) { + depth--; + if (depth == 0) + return block->end_ip; + } + } + unreachable("not reached"); +} + +void fs_visitor::calculate_payload_ranges(unsigned payload_node_count, + int *payload_last_use_ip) const +{ + int loop_depth = 0; + int loop_end_ip = 0; + + for (unsigned i = 0; i < payload_node_count; i++) + payload_last_use_ip[i] = -1; + + int ip = 0; + foreach_block_and_inst(block, fs_inst, inst, cfg) { + switch (inst->opcode) { + case BRW_OPCODE_DO: + loop_depth++; + + /* Since payload regs are deffed only at the start of the shader + * execution, any uses of the payload within a loop mean the live + * interval extends to the end of the outermost loop. 
Find the ip of + * the end now. + */ + if (loop_depth == 1) + loop_end_ip = count_to_loop_end(block); + break; + case BRW_OPCODE_WHILE: + loop_depth--; + break; + default: + break; + } + + int use_ip; + if (loop_depth > 0) + use_ip = loop_end_ip; + else + use_ip = ip; + + /* Note that UNIFORM args have been turned into FIXED_GRF by + * assign_curbe_setup(), and interpolation uses fixed hardware regs from + * the start (see interp_reg()). + */ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == FIXED_GRF) { + unsigned reg_nr = inst->src[i].nr; + if (reg_nr / reg_unit(devinfo) >= payload_node_count) + continue; + + for (unsigned j = reg_nr / reg_unit(devinfo); + j < DIV_ROUND_UP(reg_nr + regs_read(inst, i), + reg_unit(devinfo)); + j++) { + payload_last_use_ip[j] = use_ip; + assert(j < payload_node_count); + } + } + } + + if (inst->dst.file == FIXED_GRF) { + unsigned reg_nr = inst->dst.nr; + if (reg_nr / reg_unit(devinfo) < payload_node_count) { + for (unsigned j = reg_nr / reg_unit(devinfo); + j < DIV_ROUND_UP(reg_nr + regs_written(inst), + reg_unit(devinfo)); + j++) { + payload_last_use_ip[j] = use_ip; + assert(j < payload_node_count); + } + } + } + + /* Special case instructions which have extra implied registers used. */ + switch (inst->opcode) { + case CS_OPCODE_CS_TERMINATE: + payload_last_use_ip[0] = use_ip; + break; + + default: + if (inst->eot) { + /* We could omit this for the !inst->header_present case, except + * that the simulator apparently incorrectly reads from g0/g1 + * instead of sideband. It also really freaks out driver + * developers to see g0 used in unusual places, so just always + * reserve it. 
+ */ + payload_last_use_ip[0] = use_ip; + payload_last_use_ip[1] = use_ip; + } + break; + } + + ip++; + } +} + +class fs_reg_alloc { +public: + fs_reg_alloc(fs_visitor *fs): + fs(fs), devinfo(fs->devinfo), compiler(fs->compiler), + live(fs->live_analysis.require()), g(NULL), + have_spill_costs(false) + { + mem_ctx = ralloc_context(NULL); + + /* Stash the number of instructions so we can sanity check that our + * counts still match liveness. + */ + live_instr_count = fs->cfg->last_block()->end_ip + 1; + + spill_insts = _mesa_pointer_set_create(mem_ctx); + + /* Most of this allocation was written for a reg_width of 1 + * (dispatch_width == 8). In extending to SIMD16, the code was + * left in place and it was converted to have the hardware + * registers it's allocating be contiguous physical pairs of regs + * for reg_width == 2. + */ + int reg_width = fs->dispatch_width / 8; + rsi = util_logbase2(reg_width); + payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width); + + /* Get payload IP information */ + payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count); + + node_count = 0; + first_payload_node = 0; + first_mrf_hack_node = 0; + scratch_header_node = 0; + grf127_send_hack_node = 0; + first_vgrf_node = 0; + last_vgrf_node = 0; + first_spill_node = 0; + + spill_vgrf_ip = NULL; + spill_vgrf_ip_alloc = 0; + spill_node_count = 0; + } + + ~fs_reg_alloc() + { + ralloc_free(mem_ctx); + } + + bool assign_regs(bool allow_spilling, bool spill_all); + +private: + void setup_live_interference(unsigned node, + int node_start_ip, int node_end_ip); + void setup_inst_interference(const fs_inst *inst); + + void build_interference_graph(bool allow_spilling); + void discard_interference_graph(); + + fs_reg build_lane_offsets(const fs_builder &bld, + uint32_t spill_offset, int ip); + fs_reg build_single_offset(const fs_builder &bld, + uint32_t spill_offset, int ip); + + void emit_unspill(const fs_builder &bld, struct shader_stats *stats, + fs_reg dst, uint32_t 
spill_offset, unsigned count, int ip); + void emit_spill(const fs_builder &bld, struct shader_stats *stats, + fs_reg src, uint32_t spill_offset, unsigned count, int ip); + + void set_spill_costs(); + int choose_spill_reg(); + fs_reg alloc_scratch_header(); + fs_reg alloc_spill_reg(unsigned size, int ip); + void spill_reg(unsigned spill_reg); + + void *mem_ctx; + fs_visitor *fs; + const intel_device_info *devinfo; + const brw_compiler *compiler; + const fs_live_variables &live; + int live_instr_count; + + set *spill_insts; + + /* Which compiler->fs_reg_sets[] to use */ + int rsi; + + ra_graph *g; + bool have_spill_costs; + + int payload_node_count; + int *payload_last_use_ip; + + int node_count; + int first_payload_node; + int first_mrf_hack_node; + int scratch_header_node; + int grf127_send_hack_node; + int first_vgrf_node; + int last_vgrf_node; + int first_spill_node; + + int *spill_vgrf_ip; + int spill_vgrf_ip_alloc; + int spill_node_count; + + fs_reg scratch_header; +}; + +/** + * Sets the mrf_used array to indicate which MRFs are used by the shader IR + * + * This is used in assign_regs() to decide which of the GRFs that we use as + * MRFs on gfx7 get normally register allocated, and in register spilling to + * see if we can actually use MRFs to do spills without overwriting normal MRF + * contents. 
+ */ +static void +get_used_mrfs(const fs_visitor *v, bool *mrf_used) +{ + int reg_width = v->dispatch_width / 8; + + memset(mrf_used, 0, BRW_MAX_MRF(v->devinfo->ver) * sizeof(bool)); + + foreach_block_and_inst(block, fs_inst, inst, v->cfg) { + if (inst->dst.file == MRF) { + int reg = inst->dst.nr & ~BRW_MRF_COMPR4; + mrf_used[reg] = true; + if (reg_width == 2) { + if (inst->dst.nr & BRW_MRF_COMPR4) { + mrf_used[reg + 4] = true; + } else { + mrf_used[reg + 1] = true; + } + } + } + + if (inst->mlen > 0) { + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { + mrf_used[inst->base_mrf + i] = true; + } + } + } +} + +namespace { + /** + * Maximum spill block size we expect to encounter in 32B units. + * + * This is somewhat arbitrary and doesn't necessarily limit the maximum + * variable size that can be spilled -- A higher value will allow a + * variable of a given size to be spilled more efficiently with a smaller + * number of scratch messages, but will increase the likelihood of a + * collision between the MRFs reserved for spilling and other MRFs used by + * the program (and possibly increase GRF register pressure on platforms + * without hardware MRFs), what could cause register allocation to fail. + * + * For the moment reserve just enough space so a register of 32 bit + * component type and natural region width can be spilled without splitting + * into multiple (force_writemask_all) scratch messages. + */ + unsigned + spill_max_size(const backend_shader *s) + { + /* LSC is limited to SIMD16 sends */ + if (s->devinfo->has_lsc) + return 2; + + /* FINISHME - On Gfx7+ it should be possible to avoid this limit + * altogether by spilling directly from the temporary GRF + * allocated to hold the result of the instruction (and the + * scratch write header). + */ + /* FINISHME - The shader's dispatch width probably belongs in + * backend_shader (or some nonexistent fs_shader class?) + * rather than in the visitor class. 
+ */ + return static_cast(s)->dispatch_width / 8; + } + + /** + * First MRF register available for spilling. + */ + unsigned + spill_base_mrf(const backend_shader *s) + { + /* We don't use the MRF hack on Gfx9+ */ + assert(s->devinfo->ver < 9); + return BRW_MAX_MRF(s->devinfo->ver) - spill_max_size(s) - 1; + } +} + +void +fs_reg_alloc::setup_live_interference(unsigned node, + int node_start_ip, int node_end_ip) +{ + /* Mark any virtual grf that is live between the start of the program and + * the last use of a payload node interfering with that payload node. + */ + for (int i = 0; i < payload_node_count; i++) { + if (payload_last_use_ip[i] == -1) + continue; + + /* Note that we use a <= comparison, unlike vgrfs_interfere(), + * in order to not have to worry about the uniform issue described in + * calculate_live_intervals(). + */ + if (node_start_ip <= payload_last_use_ip[i]) + ra_add_node_interference(g, node, first_payload_node + i); + } + + /* If we have the MRF hack enabled, mark this node as interfering with all + * MRF registers. + */ + if (first_mrf_hack_node >= 0) { + for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->ver); i++) + ra_add_node_interference(g, node, first_mrf_hack_node + i); + } + + /* Everything interferes with the scratch header */ + if (scratch_header_node >= 0) + ra_add_node_interference(g, node, scratch_header_node); + + /* Add interference with every vgrf whose live range intersects this + * node's. We only need to look at nodes below this one as the reflexivity + * of interference will take care of the rest. 
+ */ + for (unsigned n2 = first_vgrf_node; + n2 <= (unsigned)last_vgrf_node && n2 < node; n2++) { + unsigned vgrf = n2 - first_vgrf_node; + if (!(node_end_ip <= live.vgrf_start[vgrf] || + live.vgrf_end[vgrf] <= node_start_ip)) + ra_add_node_interference(g, node, n2); + } +} + +void +fs_reg_alloc::setup_inst_interference(const fs_inst *inst) +{ + /* Certain instructions can't safely use the same register for their + * sources and destination. Add interference. + */ + if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + ra_add_node_interference(g, first_vgrf_node + inst->dst.nr, + first_vgrf_node + inst->src[i].nr); + } + } + } + + /* A compressed instruction is actually two instructions executed + * simultaneously. On most platforms, it ok to have the source and + * destination registers be the same. In this case, each instruction + * over-writes its own source and there's no problem. The real problem + * here is if the source and destination registers are off by one. Then + * you can end up in a scenario where the first instruction over-writes the + * source of the second instruction. Since the compiler doesn't know about + * this level of granularity, we simply make the source and destination + * interfere. + */ + if (inst->dst.component_size(inst->exec_size) > REG_SIZE && + inst->dst.file == VGRF) { + for (int i = 0; i < inst->sources; ++i) { + if (inst->src[i].file == VGRF) { + ra_add_node_interference(g, first_vgrf_node + inst->dst.nr, + first_vgrf_node + inst->src[i].nr); + } + } + } + + if (grf127_send_hack_node >= 0) { + /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference", + * subsection "EUISA Instructions", Send Message (page 990): + * + * "r127 must not be used for return address when there is a src and + * dest overlap in send instruction." 
+ * + * We are avoiding using grf127 as part of the destination of send + * messages adding a node interference to the grf127_send_hack_node. + * This node has a fixed assignment to grf127. + * + * We don't apply it to SIMD16 instructions because previous code avoids + * any register overlap between sources and destination. + */ + if (inst->exec_size < 16 && inst->is_send_from_grf() && + inst->dst.file == VGRF) + ra_add_node_interference(g, first_vgrf_node + inst->dst.nr, + grf127_send_hack_node); + + /* Spilling instruction are generated as SEND messages from MRF but as + * Gfx7+ supports sending from GRF the driver will maps assingn these + * MRF registers to a GRF. Implementations reuses the dest of the send + * message as source. So as we will have an overlap for sure, we create + * an interference between destination and grf127. + */ + if ((inst->opcode == SHADER_OPCODE_GFX7_SCRATCH_READ || + inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_READ) && + inst->dst.file == VGRF) + ra_add_node_interference(g, first_vgrf_node + inst->dst.nr, + grf127_send_hack_node); + } + + /* From the Skylake PRM Vol. 2a docs for sends: + * + * "It is required that the second block of GRFs does not overlap with + * the first block." + * + * Normally, this is taken care of by fixup_sends_duplicate_payload() but + * in the case where one of the registers is an undefined value, the + * register allocator may decide that they don't interfere even though + * they're used as sources in the same instruction. We also need to add + * interference here. + */ + if (devinfo->ver >= 9) { + if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && + inst->src[2].file == VGRF && inst->src[3].file == VGRF && + inst->src[2].nr != inst->src[3].nr) + ra_add_node_interference(g, first_vgrf_node + inst->src[2].nr, + first_vgrf_node + inst->src[3].nr); + } + + /* When we do send-from-GRF for FB writes, we need to ensure that the last + * write instruction sends from a high register. 
This is because the + * vertex fetcher wants to start filling the low payload registers while + * the pixel data port is still working on writing out the memory. If we + * don't do this, we get rendering artifacts. + * + * We could just do "something high". Instead, we just pick the highest + * register that works. + */ + if (inst->eot) { + const int vgrf = inst->opcode == SHADER_OPCODE_SEND ? + inst->src[2].nr : inst->src[0].nr; + const int size = DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo)); + int reg = BRW_MAX_GRF - size; + + if (first_mrf_hack_node >= 0) { + /* If something happened to spill, we want to push the EOT send + * register early enough in the register file that we don't + * conflict with any used MRF hack registers. + */ + reg -= BRW_MAX_MRF(devinfo->ver) - spill_base_mrf(fs); + } else if (grf127_send_hack_node >= 0) { + /* Avoid r127 which might be unusable if the node was previously + * written by a SIMD8 SEND message with source/destination overlap. + */ + reg--; + } + + ra_set_node_reg(g, first_vgrf_node + vgrf, reg); + + if (inst->ex_mlen > 0) { + const int vgrf = inst->src[3].nr; + reg -= DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo)); + ra_set_node_reg(g, first_vgrf_node + vgrf, reg); + } + } +} + +void +fs_reg_alloc::build_interference_graph(bool allow_spilling) +{ + /* Compute the RA node layout */ + node_count = 0; + first_payload_node = node_count; + node_count += payload_node_count; + if (devinfo->ver >= 7 && devinfo->ver < 9 && allow_spilling) { + first_mrf_hack_node = node_count; + node_count += BRW_MAX_GRF - GFX7_MRF_HACK_START; + } else { + first_mrf_hack_node = -1; + } + if (devinfo->ver >= 8) { + grf127_send_hack_node = node_count; + node_count ++; + } else { + grf127_send_hack_node = -1; + } + first_vgrf_node = node_count; + node_count += fs->alloc.count; + last_vgrf_node = node_count - 1; + if ((devinfo->ver >= 9 && devinfo->verx10 < 125) && allow_spilling) { + scratch_header_node = node_count++; + } else { + 
scratch_header_node = -1; + } + first_spill_node = node_count; + + fs->calculate_payload_ranges(payload_node_count, + payload_last_use_ip); + + assert(g == NULL); + g = ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count); + ralloc_steal(mem_ctx, g); + + /* Set up the payload nodes */ + for (int i = 0; i < payload_node_count; i++) + ra_set_node_reg(g, first_payload_node + i, i); + + if (first_mrf_hack_node >= 0) { + /* Mark each MRF reg node as being allocated to its physical + * register. + * + * The alternative would be to have per-physical-register classes, + * which would just be silly. + */ + for (int i = 0; i < BRW_MAX_MRF(devinfo->ver); i++) { + ra_set_node_reg(g, first_mrf_hack_node + i, + GFX7_MRF_HACK_START + i); + } + } + + if (grf127_send_hack_node >= 0) + ra_set_node_reg(g, grf127_send_hack_node, 127); + + /* Specify the classes of each virtual register. */ + for (unsigned i = 0; i < fs->alloc.count; i++) { + unsigned size = DIV_ROUND_UP(fs->alloc.sizes[i], reg_unit(devinfo)); + + assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) && + "Register allocation relies on split_virtual_grfs()"); + + ra_set_node_class(g, first_vgrf_node + i, + compiler->fs_reg_sets[rsi].classes[size - 1]); + } + + /* Special case: on pre-Gfx7 hardware that supports PLN, the second operand + * of a PLN instruction needs to be an even-numbered register, so we have a + * special register class aligned_bary_class to handle this case. 
+ */ + if (compiler->fs_reg_sets[rsi].aligned_bary_class) { + foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { + if (inst->opcode == FS_OPCODE_LINTERP && inst->src[0].file == VGRF && + fs->alloc.sizes[inst->src[0].nr] == + aligned_bary_size(fs->dispatch_width)) { + ra_set_node_class(g, first_vgrf_node + inst->src[0].nr, + compiler->fs_reg_sets[rsi].aligned_bary_class); + } + } + } + + /* Add interference based on the live range of the register */ + for (unsigned i = 0; i < fs->alloc.count; i++) { + setup_live_interference(first_vgrf_node + i, + live.vgrf_start[i], + live.vgrf_end[i]); + } + + /* Add interference based on the instructions in which a register is used. + */ + foreach_block_and_inst(block, fs_inst, inst, fs->cfg) + setup_inst_interference(inst); +} + +void +fs_reg_alloc::discard_interference_graph() +{ + ralloc_free(g); + g = NULL; + have_spill_costs = false; +} + +fs_reg +fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset, int ip) +{ + fs_reg offset = retype(alloc_spill_reg(1, ip), BRW_REGISTER_TYPE_UD); + fs_inst *inst = bld.MOV(offset, brw_imm_ud(spill_offset)); + _mesa_set_add(spill_insts, inst); + return offset; +} + +fs_reg +fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip) +{ + /* LSC messages are limited to SIMD16 */ + assert(bld.dispatch_width() <= 16); + + const fs_builder ubld = bld.exec_all(); + const unsigned reg_count = ubld.dispatch_width() / 8; + + fs_reg offset = retype(alloc_spill_reg(reg_count, ip), BRW_REGISTER_TYPE_UD); + fs_inst *inst; + + /* Build an offset per lane in SIMD8 */ + inst = ubld.group(8, 0).MOV(retype(offset, BRW_REGISTER_TYPE_UW), + brw_imm_uv(0x76543210)); + _mesa_set_add(spill_insts, inst); + inst = ubld.group(8, 0).MOV(offset, retype(offset, BRW_REGISTER_TYPE_UW)); + _mesa_set_add(spill_insts, inst); + + /* Build offsets in the upper 8 lanes of SIMD16 */ + if (ubld.dispatch_width() > 8) { + inst = ubld.group(8, 0).ADD( + byte_offset(offset, 
                     REG_SIZE),
         byte_offset(offset, 0),
         brw_imm_ud(8));
      _mesa_set_add(spill_insts, inst);
   }

   /* Make the offset a dword */
   inst = ubld.SHL(offset, offset, brw_imm_ud(2));
   _mesa_set_add(spill_insts, inst);

   /* Add the base offset */
   inst = ubld.ADD(offset, offset, brw_imm_ud(spill_offset));
   _mesa_set_add(spill_insts, inst);

   return offset;
}

/**
 * Emit scratch-read (fill) messages that reload `count` registers starting
 * at `spill_offset` into `dst`, choosing the message type per platform
 * generation (LSC on verx10 >= 125, stateless dataport on Gfx9+, scratch
 * reads on older parts).  Every emitted instruction is recorded in
 * spill_insts so later passes can recognize spill traffic.
 */
void
fs_reg_alloc::emit_unspill(const fs_builder &bld,
                           struct shader_stats *stats,
                           fs_reg dst,
                           uint32_t spill_offset, unsigned count, int ip)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
                             REG_SIZE;
   assert(count % reg_size == 0);

   for (unsigned i = 0; i < count / reg_size; i++) {
      ++stats->fill_count;

      fs_inst *unspill_inst;
      if (devinfo->verx10 >= 125) {
         /* LSC is limited to SIMD16 load/store but we can load more using
          * transpose messages.
          */
         const bool use_transpose = bld.dispatch_width() > 16;
         const fs_builder ubld = use_transpose ? bld.exec_all().group(1, 0) : bld;
         fs_reg offset;
         if (use_transpose) {
            offset = build_single_offset(ubld, spill_offset, ip);
         } else {
            offset = build_lane_offsets(ubld, spill_offset, ip);
         }
         /* We leave the extended descriptor empty and flag the instruction to
          * ask the generator to insert the extended descriptor in the address
          * register.  That way we don't need to burn an additional register
          * for register allocation spill/fill.
          */
         fs_reg srcs[] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            offset,        /* payload */
            fs_reg(),      /* payload2 */
         };

         unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
                                  srcs, ARRAY_SIZE(srcs));
         unspill_inst->sfid = GFX12_SFID_UGM;
         unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                           unspill_inst->exec_size,
                                           LSC_ADDR_SURFTYPE_SS,
                                           LSC_ADDR_SIZE_A32,
                                           1 /* num_coordinates */,
                                           LSC_DATA_SIZE_D32,
                                           use_transpose ? reg_size * 8 : 1 /* num_channels */,
                                           use_transpose,
                                           LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                           true /* has_dest */);
         unspill_inst->header_size = 0;
         unspill_inst->mlen =
            lsc_msg_desc_src0_len(devinfo, unspill_inst->desc);
         unspill_inst->ex_mlen = 0;
         unspill_inst->size_written =
            lsc_msg_desc_dest_len(devinfo, unspill_inst->desc) * REG_SIZE;
         unspill_inst->send_has_side_effects = false;
         unspill_inst->send_is_volatile = true;
         unspill_inst->send_ex_desc_scratch = true;
      } else if (devinfo->ver >= 9) {
         /* Gfx9+: stateless dataport oword block read through the shared
          * scratch header; only the offset dword (component 2) is rewritten
          * per message.
          */
         fs_reg header = this->scratch_header;
         fs_builder ubld = bld.exec_all().group(1, 0);
         assert(spill_offset % 16 == 0);
         unspill_inst = ubld.MOV(component(header, 2),
                                 brw_imm_ud(spill_offset / 16));
         _mesa_set_add(spill_insts, unspill_inst);

         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
         const fs_reg ex_desc = brw_imm_ud(0);

         fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header };
         unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
                                 srcs, ARRAY_SIZE(srcs));
         unspill_inst->mlen = 1;
         unspill_inst->header_size = 1;
         unspill_inst->size_written = reg_size * REG_SIZE;
         unspill_inst->send_has_side_effects = false;
         unspill_inst->send_is_volatile = true;
         unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         unspill_inst->desc =
            brw_dp_desc(devinfo, bti,
                        BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                        BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
      } else if (devinfo->ver >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
         /* The Gfx7 descriptor-based offset is 12 bits of HWORD units.
          * Because the Gfx7-style scratch block read is hardwired to BTI 255,
          * on Gfx9+ it would cause the DC to do an IA-coherent read, which
          * largely outweighs the slight advantage from not having to provide
          * the address as part of the message header, so we're better off
          * using plain old oword block reads.
          */
         unspill_inst = bld.emit(SHADER_OPCODE_GFX7_SCRATCH_READ, dst);
         unspill_inst->offset = spill_offset;
      } else {
         unspill_inst = bld.emit(SHADER_OPCODE_GFX4_SCRATCH_READ, dst);
         unspill_inst->offset = spill_offset;
         unspill_inst->base_mrf = spill_base_mrf(bld.shader);
         unspill_inst->mlen = 1; /* header contains offset */
      }
      _mesa_set_add(spill_insts, unspill_inst);

      /* Advance to the next register-sized chunk of dst / scratch. */
      dst.offset += reg_size * REG_SIZE;
      spill_offset += reg_size * REG_SIZE;
   }
}

/**
 * Emit scratch-write (spill) messages that store `count` registers of `src`
 * starting at `spill_offset`, mirroring emit_unspill's per-generation message
 * selection.  All emitted instructions are recorded in spill_insts.
 */
void
fs_reg_alloc::emit_spill(const fs_builder &bld,
                         struct shader_stats *stats,
                         fs_reg src,
                         uint32_t spill_offset, unsigned count, int ip)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const unsigned reg_size = src.component_size(bld.dispatch_width()) /
                             REG_SIZE;
   assert(count % reg_size == 0);

   for (unsigned i = 0; i < count / reg_size; i++) {
      ++stats->spill_count;

      fs_inst *spill_inst;
      if (devinfo->verx10 >= 125) {
         fs_reg offset = build_lane_offsets(bld, spill_offset, ip);
         /* We leave the extended descriptor empty and flag the instruction
          * to relocate the extended descriptor.  That way the surface offset
          * is directly put into the instruction and we don't need to use a
          * register to hold it.
          */
         fs_reg srcs[] = {
            brw_imm_ud(0),        /* desc */
            brw_imm_ud(0),        /* ex_desc */
            offset,               /* payload */
            src,                  /* payload2 */
         };
         spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
                               srcs, ARRAY_SIZE(srcs));
         spill_inst->sfid = GFX12_SFID_UGM;
         spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
                                         bld.dispatch_width(),
                                         LSC_ADDR_SURFTYPE_SS,
                                         LSC_ADDR_SIZE_A32,
                                         1 /* num_coordinates */,
                                         LSC_DATA_SIZE_D32,
                                         1 /* num_channels */,
                                         false /* transpose */,
                                         /* NOTE(review): cache policy is built
                                          * with the LOAD op even though this is
                                          * a store message -- confirm intended.
                                          */
                                         LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                         false /* has_dest */);
         spill_inst->header_size = 0;
         spill_inst->mlen = lsc_msg_desc_src0_len(devinfo, spill_inst->desc);
         spill_inst->ex_mlen = reg_size;
         spill_inst->size_written = 0;
         spill_inst->send_has_side_effects = true;
         spill_inst->send_is_volatile = false;
         spill_inst->send_ex_desc_scratch = true;
      } else if (devinfo->ver >= 9) {
         /* Gfx9+: stateless dataport oword block write via the shared
          * scratch header (see emit_unspill for the matching read path).
          */
         fs_reg header = this->scratch_header;
         fs_builder ubld = bld.exec_all().group(1, 0);
         assert(spill_offset % 16 == 0);
         spill_inst = ubld.MOV(component(header, 2),
                               brw_imm_ud(spill_offset / 16));
         _mesa_set_add(spill_insts, spill_inst);

         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
         const fs_reg ex_desc = brw_imm_ud(0);

         fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src };
         spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
                               srcs, ARRAY_SIZE(srcs));
         spill_inst->mlen = 1;
         spill_inst->ex_mlen = reg_size;
         spill_inst->size_written = 0;
         spill_inst->header_size = 1;
         spill_inst->send_has_side_effects = true;
         spill_inst->send_is_volatile = false;
         spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         spill_inst->desc =
            brw_dp_desc(devinfo, bti,
                        GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
                        BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
      } else {
         spill_inst = bld.emit(SHADER_OPCODE_GFX4_SCRATCH_WRITE,
                               bld.null_reg_f(), src);
         spill_inst->offset = spill_offset;
         spill_inst->mlen = 1 + reg_size; /* header, value */
         spill_inst->base_mrf = spill_base_mrf(bld.shader);
      }
      _mesa_set_add(spill_insts, spill_inst);

      /* Advance to the next register-sized chunk of src / scratch. */
      src.offset += reg_size * REG_SIZE;
      spill_offset += reg_size * REG_SIZE;
   }
}

/**
 * Compute a spill cost for every VGRF and hand it to the RA graph, weighting
 * uses inside loops more heavily and excluding registers generated by
 * earlier spill traffic.
 */
void
fs_reg_alloc::set_spill_costs()
{
   float block_scale = 1.0;
   float spill_costs[fs->alloc.count];
   bool no_spill[fs->alloc.count];

   for (unsigned i = 0; i < fs->alloc.count; i++) {
      spill_costs[i] = 0.0;
      no_spill[i] = false;
   }

   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
    * spill/unspill we'll have to do, and guess that the insides of
    * loops run 10 times.
    */
   foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            spill_costs[inst->src[i].nr] += regs_read(inst, i) * block_scale;
      }

      if (inst->dst.file == VGRF)
         spill_costs[inst->dst.nr] += regs_written(inst) * block_scale;

      /* Don't spill anything we generated while spilling */
      if (_mesa_set_search(spill_insts, inst)) {
         for (unsigned int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == VGRF)
               no_spill[inst->src[i].nr] = true;
         }
         if (inst->dst.file == VGRF)
            no_spill[inst->dst.nr] = true;
      }

      /* Track loop/branch nesting to scale the cost of uses inside them. */
      switch (inst->opcode) {

      case BRW_OPCODE_DO:
         block_scale *= 10;
         break;

      case BRW_OPCODE_WHILE:
         block_scale /= 10;
         break;

      case BRW_OPCODE_IF:
      case BRW_OPCODE_IFF:
         block_scale *= 0.5;
         break;

      case BRW_OPCODE_ENDIF:
         block_scale /= 0.5;
         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < fs->alloc.count; i++) {
      /* Do the no_spill check first.  Registers that are used as spill
       * temporaries may have been allocated after we calculated liveness so
       * we shouldn't look their liveness up.  Fortunately, they're always
       * used in SCRATCH_READ/WRITE instructions so they'll always be flagged
       * no_spill.
       */
      if (no_spill[i])
         continue;

      int live_length = live.vgrf_end[i] - live.vgrf_start[i];
      if (live_length <= 0)
         continue;

      /* Divide the cost (in number of spills/fills) by the log of the length
       * of the live range of the register.  This will encourage spill logic
       * to spill long-living things before spilling short-lived things where
       * spilling is less likely to actually do us any good.  We use the log
       * of the length because it will fall off very quickly and not cause us
       * to spill medium length registers with more uses.
       */
      float adjusted_cost = spill_costs[i] / logf(live_length);
      ra_set_node_spill_cost(g, first_vgrf_node + i, adjusted_cost);
   }

   have_spill_costs = true;
}

/**
 * Ask the RA graph for the cheapest spill candidate; returns the VGRF index
 * or -1 when nothing is spillable.
 */
int
fs_reg_alloc::choose_spill_reg()
{
   if (!have_spill_costs)
      set_spill_costs();

   int node = ra_get_best_spill_node(g);
   if (node < 0)
      return -1;

   assert(node >= first_vgrf_node);
   return node - first_vgrf_node;
}

/**
 * Allocate the single-register VGRF backing the Gfx9+ scratch message
 * header; it interferes with everything (live over the whole program).
 */
fs_reg
fs_reg_alloc::alloc_scratch_header()
{
   int vgrf = fs->alloc.allocate(1);
   assert(first_vgrf_node + vgrf == scratch_header_node);
   ra_set_node_class(g, scratch_header_node,
                     compiler->fs_reg_sets[rsi].classes[0]);

   setup_live_interference(scratch_header_node, 0, INT_MAX);

   return fs_reg(VGRF, vgrf, BRW_REGISTER_TYPE_UD);
}

/**
 * Allocate a temporary VGRF for spill/fill traffic around instruction `ip`
 * and wire up its interference in the existing RA graph without re-running
 * liveness.
 */
fs_reg
fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
{
   int vgrf = fs->alloc.allocate(ALIGN(size, reg_unit(devinfo)));
   int class_idx = DIV_ROUND_UP(size, reg_unit(devinfo)) - 1;
   int n = ra_add_node(g, compiler->fs_reg_sets[rsi].classes[class_idx]);
   assert(n == first_vgrf_node + vgrf);
   assert(n == first_spill_node + spill_node_count);

   /* Live only immediately around the instruction being spilled. */
   setup_live_interference(n, ip - 1, ip + 1);

   /* Add interference between this spill node and any other spill nodes for
    * the same instruction.
    */
   for (int s = 0; s < spill_node_count; s++) {
      if (spill_vgrf_ip[s] == ip)
         ra_add_node_interference(g, n, first_spill_node + s);
   }

   /* Add this spill node to the list for next time */
   if (spill_node_count >= spill_vgrf_ip_alloc) {
      if (spill_vgrf_ip_alloc == 0)
         spill_vgrf_ip_alloc = 16;
      else
         spill_vgrf_ip_alloc *= 2;
      spill_vgrf_ip = reralloc(mem_ctx, spill_vgrf_ip, int,
                               spill_vgrf_ip_alloc);
   }
   spill_vgrf_ip[spill_node_count++] = ip;

   return fs_reg(VGRF, vgrf);
}

/**
 * Rewrite every use of `spill_reg` to go through scratch memory: allocate
 * scratch space, replace reads with fills and writes with spills, and update
 * the interference graph in place.
 */
void
fs_reg_alloc::spill_reg(unsigned spill_reg)
{
   int size = fs->alloc.sizes[spill_reg];
   unsigned int spill_offset = fs->last_scratch;
   assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */

   /* Spills may use MRFs 13-15 in the SIMD16 case.  Our texturing is done
    * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
    * up to m13 (gfx6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
    * m15 (gfx4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
    * depth), starting from m1.  In summary: We may not be able to spill in
    * SIMD16 mode, because we'd stomp the FB writes.
    */
   if (!fs->spilled_any_registers) {
      if (devinfo->verx10 >= 125) {
         /* We will allocate a register on the fly */
      } else if (devinfo->ver >= 9) {
         this->scratch_header = alloc_scratch_header();
         fs_builder ubld = fs_builder(fs, 8).exec_all().at(
            fs->cfg->first_block(), fs->cfg->first_block()->start());

         fs_inst *inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
                                   this->scratch_header);
         _mesa_set_add(spill_insts, inst);
      } else {
         bool mrf_used[BRW_MAX_MRF(devinfo->ver)];
         get_used_mrfs(fs, mrf_used);

         for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->ver); i++) {
            if (mrf_used[i]) {
               fs->fail("Register spilling not supported with m%d used", i);
               return;
            }
         }
      }

      fs->spilled_any_registers = true;
   }

   fs->last_scratch += size * REG_SIZE;

   /* We're about to replace all uses of this register.  It no longer
    * conflicts with anything so we can get rid of its interference.
    */
   ra_set_node_spill_cost(g, first_vgrf_node + spill_reg, 0);
   ra_reset_node_interference(g, first_vgrf_node + spill_reg);

   /* Generate spill/unspill instructions for the objects being
    * spilled.  Right now, we spill or unspill the whole thing to a
    * virtual grf of the same size.  For most instructions, though, we
    * could just spill/unspill the GRF being accessed.
    */
   int ip = 0;
   foreach_block_and_inst (block, fs_inst, inst, fs->cfg) {
      const fs_builder ibld = fs_builder(fs, block, inst);
      exec_node *before = inst->prev;
      exec_node *after = inst->next;

      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF &&
             inst->src[i].nr == spill_reg) {
            int count = regs_read(inst, i);
            int subset_spill_offset = spill_offset +
               ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
            fs_reg unspill_dst = alloc_spill_reg(count, ip);

            inst->src[i].nr = unspill_dst.nr;
            inst->src[i].offset %= REG_SIZE;

            /* We read the largest power-of-two divisor of the register count
             * (because only POT scratch read blocks are allowed by the
             * hardware) up to the maximum supported block size.
             */
            const unsigned width =
               MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));

            /* Set exec_all() on unspill messages under the (rather
             * pessimistic) assumption that there is no one-to-one
             * correspondence between channels of the spilled variable in
             * scratch space and the scratch read message, which operates on
             * 32 bit channels.  It shouldn't hurt in any case because the
             * unspill destination is a block-local temporary.
             */
            emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
                         unspill_dst, subset_spill_offset, count, ip);
         }
      }

      if (inst->dst.file == VGRF &&
          inst->dst.nr == spill_reg &&
          inst->opcode != SHADER_OPCODE_UNDEF) {
         int subset_spill_offset = spill_offset +
            ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
         fs_reg spill_src = alloc_spill_reg(regs_written(inst), ip);

         inst->dst.nr = spill_src.nr;
         inst->dst.offset %= REG_SIZE;

         /* If we're immediately spilling the register, we should not use
          * destination dependency hints.  Doing so will cause the GPU to
          * try to read and write the register at the same time and may
          * hang the GPU.
          */
         inst->no_dd_clear = false;
         inst->no_dd_check = false;

         /* Calculate the execution width of the scratch messages (which work
          * in terms of 32 bit components so we have a fixed number of eight
          * channels per spilled register).  We attempt to write one
          * exec_size-wide component of the variable at a time without
          * exceeding the maximum number of (fake) MRF registers reserved for
          * spills.
          */
         const unsigned width = 8 * reg_unit(devinfo) *
            DIV_ROUND_UP(MIN2(inst->dst.component_size(inst->exec_size),
                              spill_max_size(fs) * REG_SIZE),
                         reg_unit(devinfo) * REG_SIZE);

         /* Spills should only write data initialized by the instruction for
          * whichever channels are enabled in the execution mask.  If that's
          * not possible we'll have to emit a matching unspill before the
          * instruction and set force_writemask_all on the spill.
          */
         const bool per_channel =
            inst->dst.is_contiguous() && type_sz(inst->dst.type) == 4 &&
            inst->exec_size == width;

         /* Builder used to emit the scratch messages. */
         const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);

         /* If our write is going to affect just part of the
          * regs_written(inst), then we need to unspill the destination since
          * we write back out all of the regs_written().  If the original
          * instruction had force_writemask_all set and is not a partial
          * write, there should be no need for the unspill since the
          * instruction will be overwriting the whole destination in any case.
          */
         if (inst->is_partial_write() ||
             (!inst->force_writemask_all && !per_channel))
            emit_unspill(ubld, &fs->shader_stats, spill_src,
                         subset_spill_offset, regs_written(inst), ip);

         emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src,
                    subset_spill_offset, regs_written(inst), ip);
      }

      /* Register the freshly inserted fill/spill instructions with the
       * interference graph.
       */
      for (fs_inst *inst = (fs_inst *)before->next;
           inst != after; inst = (fs_inst *)inst->next)
         setup_inst_interference(inst);

      /* We don't advance the ip for scratch read/write instructions
       * because we consider them to have the same ip as the instruction
       * we're spilling around for the purposes of interference.  Also, we're
       * inserting spill instructions without re-running liveness analysis
       * and we don't want to mess up our IPs.
       */
      if (!_mesa_set_search(spill_insts, inst))
         ip++;
   }

   assert(ip == live_instr_count);
}

/**
 * Run graph-coloring allocation, spilling registers until it succeeds (or
 * until nothing is left to spill).  Returns false on failure.
 */
bool
fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
{
   build_interference_graph(fs->spilled_any_registers || spill_all);

   unsigned spilled = 0;
   while (1) {
      /* Debug of register spilling: Go spill everything. */
      if (unlikely(spill_all)) {
         int reg = choose_spill_reg();
         if (reg != -1) {
            spill_reg(reg);
            continue;
         }
      }

      if (ra_allocate(g))
         break;

      if (!allow_spilling)
         return false;

      /* Failed to allocate registers.  Spill some regs, and the caller will
       * loop back into here to try again.
       */
      unsigned nr_spills = 1;
      if (compiler->spilling_rate)
         nr_spills = MAX2(1, spilled / compiler->spilling_rate);

      for (unsigned j = 0; j < nr_spills; j++) {
         int reg = choose_spill_reg();
         if (reg == -1) {
            if (j == 0)
               return false; /* Nothing to spill */
            break;
         }

         /* If we're going to spill but we've never spilled before, we need
          * to re-build the interference graph with MRFs enabled to allow
          * spilling.
          */
         if (!fs->spilled_any_registers) {
            discard_interference_graph();
            build_interference_graph(true);
         }

         spill_reg(reg);
         spilled++;
      }
   }

   if (spilled)
      fs->invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   /* Get the chosen virtual registers for each node, and map virtual
    * regs in the register classes back down to real hardware reg
    * numbers.
    */
   unsigned hw_reg_mapping[fs->alloc.count];
   fs->grf_used = fs->first_non_payload_grf;
   for (unsigned i = 0; i < fs->alloc.count; i++) {
      int reg = ra_get_node_reg(g, first_vgrf_node + i);

      hw_reg_mapping[i] = reg;
      fs->grf_used = MAX2(fs->grf_used,
                          hw_reg_mapping[i] + DIV_ROUND_UP(fs->alloc.sizes[i],
                                                           reg_unit(devinfo)));
   }

   foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
      assign_reg(devinfo, hw_reg_mapping, &inst->dst);
      for (int i = 0; i < inst->sources; i++) {
         assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
      }
   }

   fs->alloc.count = fs->grf_used;

   return true;
}

/**
 * Public entry point: allocate registers for this shader, dumping the IR on
 * an unrecoverable spill failure.
 */
bool
fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
{
   fs_reg_alloc alloc(this);
   bool success = alloc.assign_regs(allow_spilling, spill_all);
   if (!success && allow_spilling) {
      fail("no register to spill:\n");
      dump_instructions(NULL);
   }
   return success;
}
diff --git a/src/intel/compiler/elk/brw_fs_register_coalesce.cpp b/src/intel/compiler/elk/brw_fs_register_coalesce.cpp
new file mode 100644
index 00000000000..4c9bb3edba8
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_register_coalesce.cpp
@@ -0,0 +1,349 @@
/*
 *
Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_register_coalesce.cpp + * + * Implements register coalescing: Checks if the two registers involved in a + * raw move don't interfere, in which case they can both be stored in the same + * place and the MOV removed. + * + * To do this, all uses of the source of the MOV in the shader are replaced + * with the destination of the MOV. 
For example: + * + * add vgrf3:F, vgrf1:F, vgrf2:F + * mov vgrf4:F, vgrf3:F + * mul vgrf5:F, vgrf5:F, vgrf4:F + * + * becomes + * + * add vgrf4:F, vgrf1:F, vgrf2:F + * mul vgrf5:F, vgrf5:F, vgrf4:F + */ + +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_fs_live_variables.h" + +using namespace brw; + +static bool +is_nop_mov(const fs_inst *inst) +{ + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + fs_reg dst = inst->dst; + for (int i = 0; i < inst->sources; i++) { + if (!dst.equals(inst->src[i])) { + return false; + } + dst.offset += (i < inst->header_size ? REG_SIZE : + inst->exec_size * dst.stride * + type_sz(inst->src[i].type)); + } + return true; + } else if (inst->opcode == BRW_OPCODE_MOV) { + return inst->dst.equals(inst->src[0]); + } + + return false; +} + +static bool +is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst) +{ + if ((inst->opcode != BRW_OPCODE_MOV && + inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) || + inst->is_partial_write() || + inst->saturate || + inst->src[0].file != VGRF || + inst->src[0].negate || + inst->src[0].abs || + !inst->src[0].is_contiguous() || + inst->dst.file != VGRF || + inst->dst.type != inst->src[0].type) { + return false; + } + + if (v->alloc.sizes[inst->src[0].nr] > + v->alloc.sizes[inst->dst.nr]) + return false; + + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + if (!is_coalescing_payload(v->alloc, inst)) { + return false; + } + } + + return true; +} + +static bool +can_coalesce_vars(const fs_live_variables &live, const cfg_t *cfg, + const bblock_t *block, const fs_inst *inst, + int dst_var, int src_var) +{ + if (!live.vars_interfere(src_var, dst_var)) + return true; + + int dst_start = live.start[dst_var]; + int dst_end = live.end[dst_var]; + int src_start = live.start[src_var]; + int src_end = live.end[src_var]; + + /* Variables interfere and one live range isn't a subset of the other. 
*/ + if ((dst_end > src_end && src_start < dst_start) || + (src_end > dst_end && dst_start < src_start)) + return false; + + /* Check for a write to either register in the intersection of their live + * ranges. + */ + int start_ip = MAX2(dst_start, src_start); + int end_ip = MIN2(dst_end, src_end); + + foreach_block(scan_block, cfg) { + if (scan_block->end_ip < start_ip) + continue; + + int scan_ip = scan_block->start_ip - 1; + + bool seen_src_write = false; + bool seen_copy = false; + foreach_inst_in_block(fs_inst, scan_inst, scan_block) { + scan_ip++; + + /* Ignore anything before the intersection of the live ranges */ + if (scan_ip < start_ip) + continue; + + /* Ignore the copying instruction itself */ + if (scan_inst == inst) { + seen_copy = true; + continue; + } + + if (scan_ip > end_ip) + return true; /* registers do not interfere */ + + if (seen_src_write && !seen_copy) { + /* In order to satisfy the guarantee of register coalescing, we + * must ensure that the two registers always have the same value + * during the intersection of their live ranges. One way to do + * this is to simply ensure that neither is ever written apart + * from the one copy which syncs up the two registers. However, + * this can be overly conservative and only works in the case + * where the destination live range is entirely contained in the + * source live range. + * + * To handle the other case where the source is contained in the + * destination, we allow writes to the source register as long as + * they happen before the copy, in the same block as the copy, and + * the destination is never read between first such write and the + * copy. This effectively moves the write from the copy up. 
+ */ + for (int j = 0; j < scan_inst->sources; j++) { + if (regions_overlap(scan_inst->src[j], scan_inst->size_read(j), + inst->dst, inst->size_written)) + return false; /* registers interfere */ + } + } + + /* The MOV being coalesced had better be the only instruction which + * writes to the coalesce destination in the intersection. + */ + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->dst, inst->size_written)) + return false; /* registers interfere */ + + /* See the big comment above */ + if (regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + if (seen_copy || scan_block != block || + (scan_inst->force_writemask_all && !inst->force_writemask_all)) + return false; + seen_src_write = true; + } + } + } + + return true; +} + +bool +fs_visitor::register_coalesce() +{ + bool progress = false; + fs_live_variables &live = live_analysis.require(); + int src_size = 0; + int channels_remaining = 0; + unsigned src_reg = ~0u, dst_reg = ~0u; + int *dst_reg_offset = new int[MAX_VGRF_SIZE(devinfo)]; + fs_inst **mov = new fs_inst *[MAX_VGRF_SIZE(devinfo)]; + int *dst_var = new int[MAX_VGRF_SIZE(devinfo)]; + int *src_var = new int[MAX_VGRF_SIZE(devinfo)]; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (!is_coalesce_candidate(this, inst)) + continue; + + if (is_nop_mov(inst)) { + inst->opcode = BRW_OPCODE_NOP; + progress = true; + continue; + } + + if (src_reg != inst->src[0].nr) { + src_reg = inst->src[0].nr; + + src_size = alloc.sizes[inst->src[0].nr]; + assert(src_size <= MAX_VGRF_SIZE(devinfo)); + + channels_remaining = src_size; + memset(mov, 0, sizeof(*mov) * MAX_VGRF_SIZE(devinfo)); + + dst_reg = inst->dst.nr; + } + + if (dst_reg != inst->dst.nr) + continue; + + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + for (int i = 0; i < src_size; i++) { + dst_reg_offset[i] = i; + } + mov[0] = inst; + channels_remaining -= regs_written(inst); + } else { + const int offset = inst->src[0].offset / 
REG_SIZE; + if (mov[offset]) { + /* This is the second time that this offset in the register has + * been set. This means, in particular, that inst->dst was + * live before this instruction and that the live ranges of + * inst->dst and inst->src[0] overlap and we can't coalesce the + * two variables. Let's ensure that doesn't happen. + */ + channels_remaining = -1; + continue; + } + for (unsigned i = 0; i < MAX2(inst->size_written / REG_SIZE, 1); i++) + dst_reg_offset[offset + i] = inst->dst.offset / REG_SIZE + i; + mov[offset] = inst; + channels_remaining -= regs_written(inst); + } + + if (channels_remaining) + continue; + + bool can_coalesce = true; + for (int i = 0; i < src_size; i++) { + if (dst_reg_offset[i] != dst_reg_offset[0] + i) { + /* Registers are out-of-order. */ + can_coalesce = false; + src_reg = ~0u; + break; + } + + dst_var[i] = live.var_from_vgrf[dst_reg] + dst_reg_offset[i]; + src_var[i] = live.var_from_vgrf[src_reg] + i; + + if (!can_coalesce_vars(live, cfg, block, inst, dst_var[i], src_var[i])) { + can_coalesce = false; + src_reg = ~0u; + break; + } + } + + if (!can_coalesce) + continue; + + progress = true; + + for (int i = 0; i < src_size; i++) { + if (!mov[i]) + continue; + + if (mov[i]->conditional_mod == BRW_CONDITIONAL_NONE) { + mov[i]->opcode = BRW_OPCODE_NOP; + mov[i]->dst = reg_undef; + for (int j = 0; j < mov[i]->sources; j++) { + mov[i]->src[j] = reg_undef; + } + } else { + /* If we have a conditional modifier, rewrite the MOV to be a + * MOV.cmod from the coalesced register. Hopefully, cmod + * propagation will clean this up and move it to the instruction + * that writes the register. If not, this keeps things correct + * while still letting us coalesce. 
+ */ + assert(mov[i]->opcode == BRW_OPCODE_MOV); + assert(mov[i]->sources == 1); + mov[i]->src[0] = mov[i]->dst; + mov[i]->dst = retype(brw_null_reg(), mov[i]->dst.type); + } + } + + foreach_block_and_inst(block, fs_inst, scan_inst, cfg) { + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr == src_reg) { + scan_inst->dst.nr = dst_reg; + scan_inst->dst.offset = scan_inst->dst.offset % REG_SIZE + + dst_reg_offset[scan_inst->dst.offset / REG_SIZE] * REG_SIZE; + } + + for (int j = 0; j < scan_inst->sources; j++) { + if (scan_inst->src[j].file == VGRF && + scan_inst->src[j].nr == src_reg) { + scan_inst->src[j].nr = dst_reg; + scan_inst->src[j].offset = scan_inst->src[j].offset % REG_SIZE + + dst_reg_offset[scan_inst->src[j].offset / REG_SIZE] * REG_SIZE; + } + } + } + + for (int i = 0; i < src_size; i++) { + live.start[dst_var[i]] = MIN2(live.start[dst_var[i]], + live.start[src_var[i]]); + live.end[dst_var[i]] = MAX2(live.end[dst_var[i]], + live.end[src_var[i]]); + } + src_reg = ~0u; + } + + if (progress) { + foreach_block_and_inst_safe (block, backend_instruction, inst, cfg) { + if (inst->opcode == BRW_OPCODE_NOP) { + inst->remove(block, true); + } + } + + cfg->adjust_block_ips(); + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + + delete[] src_var; + delete[] dst_var; + delete[] mov; + delete[] dst_reg_offset; + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_saturate_propagation.cpp b/src/intel/compiler/elk/brw_fs_saturate_propagation.cpp new file mode 100644 index 00000000000..50b05dd92b8 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_saturate_propagation.cpp @@ -0,0 +1,165 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * 
and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_fs_live_variables.h" +#include "brw_cfg.h" + +using namespace brw; + +/** @file brw_fs_saturate_propagation.cpp + * + * Implements a pass that propagates the SAT modifier from a MOV.SAT into the + * instruction that produced the source of the MOV.SAT, thereby allowing the + * MOV's src and dst to be coalesced and the MOV removed. 
+ * + * For instance, + * + * ADD tmp, src0, src1 + * MOV.SAT dst, tmp + * + * would be transformed into + * + * ADD.SAT tmp, src0, src1 + * MOV dst, tmp + */ + +static bool +opt_saturate_propagation_local(const fs_live_variables &live, bblock_t *block) +{ + bool progress = false; + int ip = block->end_ip + 1; + + foreach_inst_in_block_reverse(fs_inst, inst, block) { + ip--; + + if (inst->opcode != BRW_OPCODE_MOV || + !inst->saturate || + inst->dst.file != VGRF || + inst->dst.type != inst->src[0].type || + inst->src[0].file != VGRF || + inst->src[0].abs) + continue; + + int src_var = live.var_from_reg(inst->src[0]); + int src_end_ip = live.end[src_var]; + + bool interfered = false; + foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { + if (scan_inst->exec_size == inst->exec_size && + regions_overlap(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) { + if (scan_inst->is_partial_write() || + (scan_inst->dst.type != inst->dst.type && + !scan_inst->can_change_types())) + break; + + if (scan_inst->saturate) { + inst->saturate = false; + progress = true; + } else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) { + if (scan_inst->can_do_saturate()) { + if (scan_inst->dst.type != inst->dst.type) { + scan_inst->dst.type = inst->dst.type; + for (int i = 0; i < scan_inst->sources; i++) { + scan_inst->src[i].type = inst->dst.type; + } + } + + if (inst->src[0].negate) { + if (scan_inst->opcode == BRW_OPCODE_MUL) { + scan_inst->src[0].negate = !scan_inst->src[0].negate; + inst->src[0].negate = false; + } else if (scan_inst->opcode == BRW_OPCODE_MAD) { + for (int i = 0; i < 2; i++) { + if (scan_inst->src[i].file == IMM) { + brw_negate_immediate(scan_inst->src[i].type, + &scan_inst->src[i].as_brw_reg()); + } else { + scan_inst->src[i].negate = !scan_inst->src[i].negate; + } + } + inst->src[0].negate = false; + } else if (scan_inst->opcode == BRW_OPCODE_ADD) { + if (scan_inst->src[1].file == IMM) { + if 
(!brw_negate_immediate(scan_inst->src[1].type, + &scan_inst->src[1].as_brw_reg())) { + break; + } + } else { + scan_inst->src[1].negate = !scan_inst->src[1].negate; + } + scan_inst->src[0].negate = !scan_inst->src[0].negate; + inst->src[0].negate = false; + } else { + break; + } + } + + scan_inst->saturate = true; + inst->saturate = false; + progress = true; + } + } + break; + } + for (int i = 0; i < scan_inst->sources; i++) { + if (scan_inst->src[i].file == VGRF && + scan_inst->src[i].nr == inst->src[0].nr && + regions_overlap( + scan_inst->src[i], scan_inst->size_read(i), + inst->src[0], inst->size_read(0))) { + if (scan_inst->opcode != BRW_OPCODE_MOV || + !scan_inst->saturate || + scan_inst->src[0].abs || + scan_inst->src[0].negate || + scan_inst->src[0].abs != inst->src[0].abs || + scan_inst->src[0].negate != inst->src[0].negate) { + interfered = true; + break; + } + } + } + + if (interfered) + break; + } + } + + return progress; +} + +bool +fs_visitor::opt_saturate_propagation() +{ + const fs_live_variables &live = live_analysis.require(); + bool progress = false; + + foreach_block (block, cfg) { + progress = opt_saturate_propagation_local(live, block) || progress; + } + + /* Live intervals are still valid. 
*/ + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_scoreboard.cpp b/src/intel/compiler/elk/brw_fs_scoreboard.cpp new file mode 100644 index 00000000000..144179941c2 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_scoreboard.cpp @@ -0,0 +1,1365 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_scoreboard.cpp + * + * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee + * data coherency between register reads and writes in previous generations. + * This lowering pass runs after register allocation in order to make up for + * it. 
+ * + * It works by performing global dataflow analysis in order to determine the + * set of potential dependencies of every instruction in the shader, and then + * inserts any required SWSB annotations and additional SYNC instructions in + * order to guarantee data coherency. + * + * WARNING - Access of the following (rarely used) ARF registers is not + * tracked here, and require the RegDist SWSB annotation to be set + * to 1 by the generator in order to avoid data races: + * + * - sp stack pointer + * - sr0 state register + * - cr0 control register + * - ip instruction pointer + * - tm0 timestamp register + * - dbg0 debug register + * - acc2-9 special accumulator registers on TGL + * - mme0-7 math macro extended accumulator registers + * + * The following ARF registers don't need to be tracked here because data + * coherency is still provided transparently by the hardware: + * + * - f0-1 flag registers + * - n0 notification register + * - tdr0 thread dependency register + */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_cfg.h" + +using namespace brw; + +namespace { + /** + * In-order instruction accounting. + * @{ + */ + + /** + * Return the RegDist pipeline the hardware will synchronize with if no + * pipeline information is provided in the SWSB annotation of an + * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb). 
+ */ + tgl_pipe + inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst) + { + if (devinfo->verx10 >= 125) { + bool has_int_src = false, has_long_src = false; + const bool has_long_pipe = !devinfo->has_64bit_float_via_math_pipe; + + if (is_send(inst)) + return TGL_PIPE_NONE; + + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file != BAD_FILE && + !inst->is_control_source(i)) { + const brw_reg_type t = inst->src[i].type; + has_int_src |= !brw_reg_type_is_floating_point(t); + has_long_src |= type_sz(t) >= 8; + } + } + + /* Avoid the emitting (RegDist, SWSB) annotations for long + * instructions on platforms where they are unordered. It's not clear + * what the inferred sync pipe is for them or if we are even allowed + * to use these annotations in this case. Return NONE, which should + * prevent baked_{un,}ordered_dependency_mode functions from even + * trying to emit these annotations. + */ + if (!has_long_pipe && has_long_src) + return TGL_PIPE_NONE; + + return has_long_src ? TGL_PIPE_LONG : + has_int_src ? TGL_PIPE_INT : + TGL_PIPE_FLOAT; + + } else { + return TGL_PIPE_FLOAT; + } + } + + /** + * Return the RegDist pipeline that will execute an instruction, or + * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the + * RegDist synchronization mechanism. 
+ */ + tgl_pipe + inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst) + { + const brw_reg_type t = get_exec_type(inst); + const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) && + ((inst->opcode == BRW_OPCODE_MUL && + MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) || + (inst->opcode == BRW_OPCODE_MAD && + MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4)); + + if (is_unordered(devinfo, inst)) + return TGL_PIPE_NONE; + else if (devinfo->verx10 < 125) + return TGL_PIPE_FLOAT; + else if (inst->is_math() && devinfo->ver >= 20) + return TGL_PIPE_MATH; + else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT || + inst->opcode == SHADER_OPCODE_BROADCAST || + inst->opcode == SHADER_OPCODE_SHUFFLE) + return TGL_PIPE_INT; + else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT) + return TGL_PIPE_FLOAT; + else if (devinfo->ver >= 20 && type_sz(inst->dst.type) >= 8 && + brw_reg_type_is_floating_point(inst->dst.type)) { + assert(devinfo->has_64bit_float); + return TGL_PIPE_LONG; + } else if (devinfo->ver < 20 && + (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 || + is_dword_multiply)) { + assert(devinfo->has_64bit_float || devinfo->has_64bit_int || + devinfo->has_integer_dword_mul); + return TGL_PIPE_LONG; + } else if (brw_reg_type_is_floating_point(inst->dst.type)) + return TGL_PIPE_FLOAT; + else + return TGL_PIPE_INT; + } + + /** + * Index of the \p p pipeline counter in the ordered_address vector defined + * below. + */ +#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \ + (abort(), ~0u)) + + /** + * Number of in-order hardware instructions for pipeline index \p contained + * in this IR instruction. This determines the increment applied to the + * RegDist counter calculated for any ordered dependency that crosses this + * instruction. 
+ */ + unsigned + ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst, + unsigned p) + { + switch (inst->opcode) { + case BRW_OPCODE_SYNC: + case BRW_OPCODE_DO: + case SHADER_OPCODE_UNDEF: + case SHADER_OPCODE_HALT_TARGET: + case FS_OPCODE_SCHEDULING_FENCE: + return 0; + default: + /* Note that the following is inaccurate for virtual instructions + * that expand to more in-order instructions than assumed here, but + * that can only lead to suboptimal execution ordering, data + * coherency won't be impacted. Providing exact RegDist counts for + * each virtual instruction would allow better ALU performance, but + * it would require keeping this switch statement in perfect sync + * with the generator in order to avoid data corruption. Lesson is + * (again) don't use virtual instructions if you want optimal + * scheduling. + */ + if (!is_unordered(devinfo, inst) && + (p == IDX(inferred_exec_pipe(devinfo, inst)) || + p == IDX(TGL_PIPE_ALL))) + return 1; + else + return 0; + } + } + + /** + * Type for an instruction counter that increments for in-order + * instructions only, arbitrarily denoted 'jp' throughout this lowering + * pass in order to distinguish it from the regular instruction counter. + * This is represented as a vector with an independent counter for each + * asynchronous ALU pipeline in the EU. + */ + struct ordered_address { + /** + * Construct the ordered address of a dependency known to execute on a + * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL + * is provided), in which case the vector counter will be initialized + * with all components equal to INT_MIN (always satisfied) except for + * component IDX(p). + */ + ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) { + for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) + jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ? 
+ INT_MIN : jp0); + } + + int jp[IDX(TGL_PIPE_ALL)]; + + friend bool + operator==(const ordered_address &jp0, const ordered_address &jp1) + { + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) { + if (jp0.jp[p] != jp1.jp[p]) + return false; + } + + return true; + } + }; + + /** + * Return true if the specified ordered address is trivially satisfied for + * all pipelines except potentially for the specified pipeline \p p. + */ + bool + is_single_pipe(const ordered_address &jp, tgl_pipe p) + { + for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) { + if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN) + return false; + } + + return true; + } + + /** + * Return the number of instructions in the program. + */ + unsigned + num_instructions(const backend_shader *shader) + { + return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1; + } + + /** + * Calculate the local ordered_address instruction counter at every + * instruction of the shader for subsequent constant-time look-up. + */ + ordered_address * + ordered_inst_addresses(const fs_visitor *shader) + { + ordered_address *jps = new ordered_address[num_instructions(shader)]; + ordered_address jp(TGL_PIPE_ALL, 0); + unsigned ip = 0; + + foreach_block_and_inst(block, fs_inst, inst, shader->cfg) { + jps[ip] = jp; + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + jp.jp[p] += ordered_unit(shader->devinfo, inst, p); + ip++; + } + + return jps; + } + + /** + * Synchronization mode required for data manipulated by in-order + * instructions. + * + * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate + * enum for additional type safety. The hardware doesn't provide control + * over the synchronization mode for RegDist annotations, this is only used + * internally in this pass in order to optimize out redundant read + * dependencies where possible. 
+ */ + enum tgl_regdist_mode { + TGL_REGDIST_NULL = 0, + TGL_REGDIST_SRC = 1, + TGL_REGDIST_DST = 2 + }; + + /** + * Allow bitwise arithmetic of tgl_regdist_mode enums. + */ + tgl_regdist_mode + operator|(tgl_regdist_mode x, tgl_regdist_mode y) + { + return tgl_regdist_mode(unsigned(x) | unsigned(y)); + } + + tgl_regdist_mode + operator&(tgl_regdist_mode x, tgl_regdist_mode y) + { + return tgl_regdist_mode(unsigned(x) & unsigned(y)); + } + + tgl_regdist_mode & + operator|=(tgl_regdist_mode &x, tgl_regdist_mode y) + { + return x = x | y; + } + + tgl_regdist_mode & + operator&=(tgl_regdist_mode &x, tgl_regdist_mode y) + { + return x = x & y; + } + + /** @} */ + + /** + * Representation of an equivalence relation among the set of unsigned + * integers. + * + * Its initial state is the identity relation '~' such that i ~ j if and + * only if i == j for every pair of unsigned integers i and j. + */ + struct equivalence_relation { + equivalence_relation(unsigned n) : is(new unsigned[n]), n(n) + { + for (unsigned i = 0; i < n; i++) + is[i] = i; + } + + ~equivalence_relation() + { + delete[] is; + } + + /** + * Return equivalence class index of the specified element. Effectively + * this is the numeric value of an arbitrary representative from the + * equivalence class. + * + * Allows the evaluation of the equivalence relation according to the + * rule that i ~ j if and only if lookup(i) == lookup(j). + */ + unsigned + lookup(unsigned i) const + { + if (i < n && is[i] != i) + return lookup(is[i]); + else + return i; + } + + /** + * Create an array with the results of the lookup() method for + * constant-time evaluation. + */ + unsigned * + flatten() const + { + unsigned *ids = new unsigned[n]; + + for (unsigned i = 0; i < n; i++) + ids[i] = lookup(i); + + return ids; + } + + /** + * Mutate the existing equivalence relation minimally by imposing the + * additional requirement that i ~ j. 
+ * + * The algorithm updates the internal representation recursively in + * order to guarantee transitivity while preserving the previously + * specified equivalence requirements. + */ + unsigned + link(unsigned i, unsigned j) + { + const unsigned k = lookup(i); + assign(i, k); + assign(j, k); + return k; + } + + private: + equivalence_relation(const equivalence_relation &); + + equivalence_relation & + operator=(const equivalence_relation &); + + /** + * Assign the representative of \p from to be equivalent to \p to. + * + * At the same time the data structure is partially flattened as much as + * it's possible without increasing the number of recursive calls. + */ + void + assign(unsigned from, unsigned to) + { + if (from != to) { + assert(from < n); + + if (is[from] != from) + assign(is[from], to); + + is[from] = to; + } + } + + unsigned *is; + unsigned n; + }; + + /** + * Representation of a data dependency between two instructions in the + * program. + * @{ + */ + struct dependency { + /** + * No dependency information. + */ + dependency() : ordered(TGL_REGDIST_NULL), jp(), + unordered(TGL_SBID_NULL), id(0), + exec_all(false) {} + + /** + * Construct a dependency on the in-order instruction with the provided + * ordered_address instruction counter. + */ + dependency(tgl_regdist_mode mode, const ordered_address &jp, + bool exec_all) : + ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0), + exec_all(exec_all) {} + + /** + * Construct a dependency on the out-of-order instruction with the + * specified synchronization token. + */ + dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) : + ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id), + exec_all(exec_all) {} + + /** + * Synchronization mode of in-order dependency, or zero if no in-order + * dependency is present. + */ + tgl_regdist_mode ordered; + + /** + * Instruction counter of in-order dependency. 
+ * + * For a dependency part of a different block in the program, this is + * relative to the specific control flow path taken between the + * dependency and the current block: It is the ordered_address such that + * the difference between it and the ordered_address of the first + * instruction of the current block is exactly the number of in-order + * instructions across that control flow path. It is not guaranteed to + * be equal to the local ordered_address of the generating instruction + * [as returned by ordered_inst_addresses()], except for block-local + * dependencies. + */ + ordered_address jp; + + /** + * Synchronization mode of unordered dependency, or zero if no unordered + * dependency is present. + */ + tgl_sbid_mode unordered; + + /** Synchronization token of out-of-order dependency. */ + unsigned id; + + /** + * Whether the dependency could be run with execution masking disabled, + * which might lead to the unwanted execution of the generating + * instruction in cases where a BB is executed with all channels + * disabled due to hardware bug Wa_1407528679. + */ + bool exec_all; + + /** + * Trivial in-order dependency that's always satisfied. + * + * Note that unlike a default-constructed dependency() which is also + * trivially satisfied, this is considered to provide dependency + * information and can be used to clear a previously pending dependency + * via shadow(). + */ + static const dependency done; + + friend bool + operator==(const dependency &dep0, const dependency &dep1) + { + return dep0.ordered == dep1.ordered && + dep0.jp == dep1.jp && + dep0.unordered == dep1.unordered && + dep0.id == dep1.id && + dep0.exec_all == dep1.exec_all; + } + + friend bool + operator!=(const dependency &dep0, const dependency &dep1) + { + return !(dep0 == dep1); + } + }; + + const dependency dependency::done = + dependency(TGL_REGDIST_DST, ordered_address(), false); + + /** + * Return whether \p dep contains any dependency information. 
+ */ + bool + is_valid(const dependency &dep) + { + return dep.ordered || dep.unordered; + } + + /** + * Combine \p dep0 and \p dep1 into a single dependency object that is only + * satisfied when both original dependencies are satisfied. This might + * involve updating the equivalence relation \p eq in order to make sure + * that both out-of-order dependencies are assigned the same hardware SBID + * as synchronization token. + */ + dependency + merge(equivalence_relation &eq, + const dependency &dep0, const dependency &dep1) + { + dependency dep; + + if (dep0.ordered || dep1.ordered) { + dep.ordered = dep0.ordered | dep1.ordered; + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]); + } + + if (dep0.unordered || dep1.unordered) { + dep.unordered = dep0.unordered | dep1.unordered; + dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id, + dep1.unordered ? dep1.id : dep0.id); + } + + dep.exec_all = dep0.exec_all || dep1.exec_all; + + return dep; + } + + /** + * Override dependency information of \p dep0 with that of \p dep1. + */ + dependency + shadow(const dependency &dep0, const dependency &dep1) + { + if (dep0.ordered == TGL_REGDIST_SRC && + is_valid(dep1) && !(dep1.unordered & TGL_SBID_DST) && + !(dep1.ordered & TGL_REGDIST_DST)) { + /* As an optimization (see dependency_for_read()), + * instructions with a RaR dependency don't synchronize + * against a previous in-order read, so we need to pass + * through both ordered dependencies instead of simply + * dropping the first one. Otherwise we could encounter a + * WaR data hazard between OP0 and OP2 in cases like: + * + * OP0 r1:f r0:d + * OP1 r2:d r0:d + * OP2 r0:d r3:d + * + * since only the integer-pipeline r0 dependency from OP1 + * would be visible to OP2, even though OP0 could technically + * execute after OP1 due to the floating-point and integer + * pipelines being asynchronous on Gfx12.5+ platforms, so + * synchronizing OP2 against OP1 would be insufficient. 
+ */ + dependency dep = dep1; + + dep.ordered |= dep0.ordered; + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + dep.jp.jp[p] = MAX2(dep.jp.jp[p], dep0.jp.jp[p]); + + return dep; + } else { + return is_valid(dep1) ? dep1 : dep0; + } + } + + /** + * Translate dependency information across the program. + * + * This returns a dependency on the same instruction translated to the + * ordered_address space of a different block. The correct shift for + * transporting a dependency across an edge of the CFG is the difference + * between the local ordered_address of the first instruction of the target + * block and the local ordered_address of the instruction immediately after + * the end of the origin block. + */ + dependency + transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)]) + { + if (dep.ordered) { + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) { + if (dep.jp.jp[p] > INT_MIN) + dep.jp.jp[p] += delta[p]; + } + } + + return dep; + } + + /** + * Return simplified dependency removing any synchronization modes not + * applicable to an instruction reading the same register location. + */ + dependency + dependency_for_read(dependency dep) + { + dep.ordered &= TGL_REGDIST_DST; + return dep; + } + + /** + * Return simplified dependency removing any synchronization modes not + * applicable to an instruction \p inst writing the same register location. + * + * This clears any WaR dependency for writes performed from the same + * pipeline as the read, since there is no possibility for a data hazard. + */ + dependency + dependency_for_write(const struct intel_device_info *devinfo, + const fs_inst *inst, dependency dep) + { + if (!is_unordered(devinfo, inst) && + is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst))) + dep.ordered &= TGL_REGDIST_DST; + return dep; + } + + /** @} */ + + /** + * Scoreboard representation. This keeps track of the data dependencies of + * registers with GRF granularity. 
+ */ + class scoreboard { + public: + /** + * Look up the most current data dependency for register \p r. + */ + dependency + get(const fs_reg &r) const + { + if (const dependency *p = const_cast(this)->dep(r)) + return *p; + else + return dependency(); + } + + /** + * Specify the most current data dependency for register \p r. + */ + void + set(const fs_reg &r, const dependency &d) + { + if (dependency *p = dep(r)) + *p = d; + } + + /** + * Component-wise merge() of corresponding dependencies from two + * scoreboard objects. \sa merge(). + */ + friend scoreboard + merge(equivalence_relation &eq, + const scoreboard &sb0, const scoreboard &sb1) + { + scoreboard sb; + + for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++) + sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]); + + sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep); + sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep); + + return sb; + } + + /** + * Component-wise shadow() of corresponding dependencies from two + * scoreboard objects. \sa shadow(). + */ + friend scoreboard + shadow(const scoreboard &sb0, const scoreboard &sb1) + { + scoreboard sb; + + for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++) + sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]); + + sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep); + sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep); + + return sb; + } + + /** + * Component-wise transport() of dependencies from a scoreboard + * object. \sa transport(). 
+ */ + friend scoreboard + transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)]) + { + scoreboard sb; + + for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++) + sb.grf_deps[i] = transport(sb0.grf_deps[i], delta); + + sb.addr_dep = transport(sb0.addr_dep, delta); + sb.accum_dep = transport(sb0.accum_dep, delta); + + return sb; + } + + friend bool + operator==(const scoreboard &sb0, const scoreboard &sb1) + { + for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) { + if (sb0.grf_deps[i] != sb1.grf_deps[i]) + return false; + } + + if (sb0.addr_dep != sb1.addr_dep) + return false; + + if (sb0.accum_dep != sb1.accum_dep) + return false; + + return true; + } + + friend bool + operator!=(const scoreboard &sb0, const scoreboard &sb1) + { + return !(sb0 == sb1); + } + + private: + dependency grf_deps[XE2_MAX_GRF]; + dependency addr_dep; + dependency accum_dep; + + dependency * + dep(const fs_reg &r) + { + const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE : + reg_offset(r) / REG_SIZE); + + return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] : + r.file == MRF ? &grf_deps[GFX7_MRF_HACK_START + reg] : + r.file == ARF && reg >= BRW_ARF_ADDRESS && + reg < BRW_ARF_ACCUMULATOR ? &addr_dep : + r.file == ARF && reg >= BRW_ARF_ACCUMULATOR && + reg < BRW_ARF_FLAG ? &accum_dep : + NULL); + } + }; + + /** + * Dependency list handling. 
+ * @{ + */ + struct dependency_list { + dependency_list() : deps(NULL), n(0) {} + + ~dependency_list() + { + free(deps); + } + + void + push_back(const dependency &dep) + { + deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps)); + deps[n++] = dep; + } + + unsigned + size() const + { + return n; + } + + const dependency & + operator[](unsigned i) const + { + assert(i < n); + return deps[i]; + } + + dependency & + operator[](unsigned i) + { + assert(i < n); + return deps[i]; + } + + private: + dependency_list(const dependency_list &); + dependency_list & + operator=(const dependency_list &); + + dependency *deps; + unsigned n; + }; + + /** + * Add dependency \p dep to the list of dependencies of an instruction + * \p deps. + */ + void + add_dependency(const unsigned *ids, dependency_list &deps, dependency dep) + { + if (is_valid(dep)) { + /* Translate the unordered dependency token first in order to keep + * the list minimally redundant. + */ + if (dep.unordered) + dep.id = ids[dep.id]; + + /* Try to combine the specified dependency with any existing ones. */ + for (unsigned i = 0; i < deps.size(); i++) { + /* Don't combine otherwise matching dependencies if there is an + * exec_all mismatch which would cause a SET dependency to gain an + * exec_all flag, since that would prevent it from being baked + * into the instruction we want to allocate an SBID for. 
+ */ + if (deps[i].exec_all != dep.exec_all && + (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) && + (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET))) + continue; + + if (dep.ordered && deps[i].ordered) { + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]); + + deps[i].ordered |= dep.ordered; + deps[i].exec_all |= dep.exec_all; + dep.ordered = TGL_REGDIST_NULL; + } + + if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) { + deps[i].unordered |= dep.unordered; + deps[i].exec_all |= dep.exec_all; + dep.unordered = TGL_SBID_NULL; + } + } + + /* Add it to the end of the list if necessary. */ + if (is_valid(dep)) + deps.push_back(dep); + } + } + + /** + * Construct a tgl_swsb annotation encoding any ordered dependencies from + * the dependency list \p deps of an instruction with ordered_address \p + * jp. If \p exec_all is false only dependencies known to be executed with + * channel masking applied will be considered in the calculation. + */ + tgl_swsb + ordered_dependency_swsb(const dependency_list &deps, + const ordered_address &jp, + bool exec_all) + { + tgl_pipe p = TGL_PIPE_NONE; + unsigned min_dist = ~0u; + + for (unsigned i = 0; i < deps.size(); i++) { + if (deps[i].ordered && exec_all >= deps[i].exec_all) { + for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) { + const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]); + const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10); + assert(jp.jp[q] > deps[i].jp.jp[q]); + if (dist <= max_dist) { + p = (p && IDX(p) != q ? TGL_PIPE_ALL : + tgl_pipe(TGL_PIPE_FLOAT + q)); + min_dist = MIN3(min_dist, dist, 7); + } + } + } + } + + return { p ? min_dist : 0, p }; + } + + /** + * Return whether the dependency list \p deps of an instruction with + * ordered_address \p jp has any non-trivial ordered dependencies. 
If \p + * exec_all is false only dependencies known to be executed with channel + * masking applied will be considered in the calculation. + */ + bool + find_ordered_dependency(const dependency_list &deps, + const ordered_address &jp, + bool exec_all) + { + return ordered_dependency_swsb(deps, jp, exec_all).regdist; + } + + /** + * Return the full tgl_sbid_mode bitset for the first unordered dependency + * on the list \p deps that matches the specified tgl_sbid_mode, or zero if + * no such dependency is present. If \p exec_all is false only + * dependencies known to be executed with channel masking applied will be + * considered in the calculation. + */ + tgl_sbid_mode + find_unordered_dependency(const dependency_list &deps, + tgl_sbid_mode unordered, + bool exec_all) + { + if (unordered) { + for (unsigned i = 0; i < deps.size(); i++) { + if ((unordered & deps[i].unordered) && + exec_all >= deps[i].exec_all) + return deps[i].unordered; + } + } + + return TGL_SBID_NULL; + } + + /** + * Return the tgl_sbid_mode bitset of an unordered dependency from the list + * \p deps that can be represented directly in the SWSB annotation of the + * instruction without additional SYNC instructions, or zero if no such + * dependency is present. 
+ */ + tgl_sbid_mode + baked_unordered_dependency_mode(const struct intel_device_info *devinfo, + const fs_inst *inst, + const dependency_list &deps, + const ordered_address &jp) + { + const bool exec_all = inst->force_writemask_all; + const bool has_ordered = find_ordered_dependency(deps, jp, exec_all); + const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp, + exec_all).pipe; + + if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all)) + return find_unordered_dependency(deps, TGL_SBID_SET, exec_all); + else if (has_ordered && is_unordered(devinfo, inst)) + return TGL_SBID_NULL; + else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) && + (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst))) + return find_unordered_dependency(deps, TGL_SBID_DST, exec_all); + else if (!has_ordered) + return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all); + else + return TGL_SBID_NULL; + } + + /** + * Return whether an ordered dependency from the list \p deps can be + * represented directly in the SWSB annotation of the instruction without + * additional SYNC instructions. + */ + bool + baked_ordered_dependency_mode(const struct intel_device_info *devinfo, + const fs_inst *inst, + const dependency_list &deps, + const ordered_address &jp) + { + const bool exec_all = inst->force_writemask_all; + const bool has_ordered = find_ordered_dependency(deps, jp, exec_all); + const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp, + exec_all).pipe; + const tgl_sbid_mode unordered_mode = + baked_unordered_dependency_mode(devinfo, inst, deps, jp); + + if (!has_ordered) + return false; + else if (!unordered_mode) + return true; + else + return ordered_pipe == inferred_sync_pipe(devinfo, inst) && + unordered_mode == (is_unordered(devinfo, inst) ? TGL_SBID_SET : + TGL_SBID_DST); + } + + /** @} */ + + /** + * Shader instruction dependency calculation. 
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      /* In-order execution pipe of this instruction (if any), and its
       * ordered address within that pipe.
       */
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                 ordered_address();
      const bool is_ordered = ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL));
      /* Math instructions on pre-Gfx20 hardware (and DF operations routed
       * through the math pipe) read their sources asynchronously even
       * though they are not send-like instructions.
       */
      const bool is_unordered_math =
         (inst->is_math() && devinfo->ver < 20) ||
         (devinfo->has_64bit_float_via_math_pipe &&
          (get_exec_type(inst) == BRW_REGISTER_TYPE_DF ||
           inst->dst.type == BRW_REGISTER_TYPE_DF));

      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->opcode == BRW_OPCODE_DPAS ||
             is_unordered_math) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++) {
            const fs_reg r = byte_offset(inst->src[i], REG_SIZE * j);
            sb.set(r, shadow(sb.get(r), rd_dep));
         }
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      /* Sends that source their payload from MRF space read those registers
       * asynchronously as well.
       */
      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(devinfo, inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         is_ordered ? dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      /* Per-block "deltas": the dependencies produced by executing each
       * block in isolation, computed by the local pass above.
       */
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      /* in_sbs[b] accumulates the dependencies pending at the start of
       * block b (the result of this function).  out_sbs[b] caches the last
       * scoreboard propagated out of block b, and is used to detect when
       * the iteration has reached a fixed point.
       */
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      /* Standard forward dataflow iteration: keep propagating until no
       * block's outgoing scoreboard changes.
       */
      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            /* State at the end of the block: whatever came in, overridden
             * by anything the block itself produced.
             */
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

                  /* Shift ordered addresses into the successor block's
                   * local ordered_address space (see transport()).
                   */
                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<fs_inst *>(block->end()), p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
+ */ + dependency_list * + gather_inst_dependencies(const fs_visitor *shader, + const ordered_address *jps) + { + const struct intel_device_info *devinfo = shader->devinfo; + equivalence_relation eq(num_instructions(shader)); + scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq); + const unsigned *ids = eq.flatten(); + dependency_list *deps = new dependency_list[num_instructions(shader)]; + unsigned ip = 0; + + foreach_block_and_inst(block, fs_inst, inst, shader->cfg) { + const bool exec_all = inst->force_writemask_all; + const tgl_pipe p = inferred_exec_pipe(devinfo, inst); + scoreboard &sb = sbs[block->num]; + + for (unsigned i = 0; i < inst->sources; i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + add_dependency(ids, deps[ip], dependency_for_read( + sb.get(byte_offset(inst->src[i], REG_SIZE * j)))); + } + + if (inst->reads_accumulator_implicitly()) { + /* Wa_22012725308: + * + * "When the accumulator registers are used as source and/or + * destination, hardware does not ensure prevention of write + * after read hazard across execution pipes." 
+ */ + const dependency dep = sb.get(brw_acc_reg(8)); + if (dep.ordered && !is_single_pipe(dep.jp, p)) + add_dependency(ids, deps[ip], dep); + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + add_dependency(ids, deps[ip], dependency_for_read( + sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0)))); + } + + if (is_unordered(devinfo, inst) && !inst->eot) + add_dependency(ids, deps[ip], + dependency(TGL_SBID_SET, ip, exec_all)); + + if (!inst->no_dd_check) { + if (inst->dst.file != BAD_FILE && !inst->dst.is_null() && + !inst->dst.is_accumulator()) { + for (unsigned j = 0; j < regs_written(inst); j++) { + add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst, + sb.get(byte_offset(inst->dst, REG_SIZE * j)))); + } + } + + if (inst->writes_accumulator_implicitly(devinfo) || + inst->dst.is_accumulator()) { + /* Wa_22012725308: + * + * "When the accumulator registers are used as source and/or + * destination, hardware does not ensure prevention of write + * after read hazard across execution pipes." + */ + const dependency dep = sb.get(brw_acc_reg(8)); + if (dep.ordered && !is_single_pipe(dep.jp, p)) + add_dependency(ids, deps[ip], dep); + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->implied_mrf_writes(); j++) + add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst, + sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0)))); + } + } + + update_inst_scoreboard(shader, jps, inst, ip, sb); + ip++; + } + + delete[] sbs; + delete[] ids; + + return deps; + } + + /** @} */ + + /** + * Allocate SBID tokens to track the execution of every out-of-order + * instruction of the shader. + */ + dependency_list * + allocate_inst_dependencies(const fs_visitor *shader, + const dependency_list *deps0) + { + /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in + * shaders with a large number of SEND messages. + * + * XXX - Use 32 SBIDs on Xe2+ while in large GRF mode. 
+ */ + const unsigned num_sbids = 16; + + /* Allocate an unordered dependency ID to hardware SBID translation + * table with as many entries as instructions there are in the shader, + * which is the maximum number of unordered IDs we can find in the + * program. + */ + unsigned *ids = new unsigned[num_instructions(shader)]; + for (unsigned ip = 0; ip < num_instructions(shader); ip++) + ids[ip] = ~0u; + + dependency_list *deps1 = new dependency_list[num_instructions(shader)]; + unsigned next_id = 0; + + for (unsigned ip = 0; ip < num_instructions(shader); ip++) { + for (unsigned i = 0; i < deps0[ip].size(); i++) { + const dependency &dep = deps0[ip][i]; + + if (dep.unordered && ids[dep.id] == ~0u) + ids[dep.id] = (next_id++) & (num_sbids - 1); + + add_dependency(ids, deps1[ip], dep); + } + } + + delete[] ids; + + return deps1; + } + + /** + * Emit dependency information provided by \p deps into the shader, + * inserting additional SYNC instructions for dependencies that can't be + * represented directly by annotating existing instructions. + */ + void + emit_inst_dependencies(fs_visitor *shader, + const ordered_address *jps, + const dependency_list *deps) + { + const struct intel_device_info *devinfo = shader->devinfo; + unsigned ip = 0; + + foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) { + const bool exec_all = inst->force_writemask_all; + const bool ordered_mode = + baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]); + const tgl_sbid_mode unordered_mode = + baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]); + tgl_swsb swsb = !ordered_mode ? 
tgl_swsb() : + ordered_dependency_swsb(deps[ip], jps[ip], exec_all); + + for (unsigned i = 0; i < deps[ip].size(); i++) { + const dependency &dep = deps[ip][i]; + + if (dep.unordered) { + if (unordered_mode == dep.unordered && + exec_all >= dep.exec_all && !swsb.mode) { + /* Bake unordered dependency into the instruction's SWSB if + * possible, except in cases where the current instruction + * isn't marked NoMask but the dependency is, since that + * might lead to data coherency issues due to + * Wa_1407528679. + */ + swsb.sbid = dep.id; + swsb.mode = dep.unordered; + } else { + /* Emit dependency into the SWSB of an extra SYNC + * instruction. + */ + const fs_builder ibld = fs_builder(shader, block, inst) + .exec_all().group(1, 0); + fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(), + brw_imm_ud(TGL_SYNC_NOP)); + sync->sched.sbid = dep.id; + sync->sched.mode = dep.unordered; + assert(!(sync->sched.mode & TGL_SBID_SET)); + } + } + } + + for (unsigned i = 0; i < deps[ip].size(); i++) { + const dependency &dep = deps[ip][i]; + + if (dep.ordered && + find_ordered_dependency(deps[ip], jps[ip], true) && + (!ordered_mode || dep.exec_all > exec_all)) { + /* If the current instruction is not marked NoMask but an + * ordered dependency is, perform the synchronization as a + * separate NoMask SYNC instruction in order to avoid data + * coherency issues due to Wa_1407528679. The similar + * scenario with unordered dependencies should have been + * handled above. + */ + const fs_builder ibld = fs_builder(shader, block, inst) + .exec_all().group(1, 0); + fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(), + brw_imm_ud(TGL_SYNC_NOP)); + sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true); + break; + } + } + + /* Update the IR. 
*/ + inst->sched = swsb; + inst->no_dd_check = inst->no_dd_clear = false; + ip++; + } + } +} + +bool +fs_visitor::lower_scoreboard() +{ + if (devinfo->ver >= 12) { + const ordered_address *jps = ordered_inst_addresses(this); + const dependency_list *deps0 = gather_inst_dependencies(this, jps); + const dependency_list *deps1 = allocate_inst_dependencies(this, deps0); + emit_inst_dependencies(this, jps, deps1); + delete[] deps1; + delete[] deps0; + delete[] jps; + } + + return true; +} diff --git a/src/intel/compiler/elk/brw_fs_sel_peephole.cpp b/src/intel/compiler/elk/brw_fs_sel_peephole.cpp new file mode 100644 index 00000000000..1b7fd14e59e --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_sel_peephole.cpp @@ -0,0 +1,229 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" +#include "brw_cfg.h" + +/** @file brw_fs_sel_peephole.cpp + * + * This file contains the opt_peephole_sel() optimization pass that replaces + * MOV instructions to the same destination in the "then" and "else" bodies of + * an if statement with SEL instructions. + */ + +/* Four MOVs seems to be pretty typical, so I picked the next power of two in + * the hopes that it would handle almost anything possible in a single + * pass. + */ +#define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */ + +using namespace brw; + +/** + * Scans forwards from an IF counting consecutive MOV instructions in the + * "then" and "else" blocks of the if statement. + * + * A pointer to the bblock_t following the IF is passed as the + * argument. The function stores pointers to the MOV instructions in the + * and arrays. + * + * \return the minimum number of MOVs found in the two branches or zero if + * an error occurred. + * + * E.g.: + * IF ... + * then_mov[0] = MOV g4, ... + * then_mov[1] = MOV g5, ... + * then_mov[2] = MOV g6, ... + * ELSE ... + * else_mov[0] = MOV g4, ... + * else_mov[1] = MOV g5, ... + * else_mov[2] = MOV g7, ... + * ENDIF + * returns 3. + */ +static int +count_movs_from_if(const intel_device_info *devinfo, + fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS], + bblock_t *then_block, bblock_t *else_block) +{ + int then_movs = 0; + foreach_inst_in_block(fs_inst, inst, then_block) { + if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV || + inst->flags_written(devinfo)) + break; + + then_mov[then_movs] = inst; + then_movs++; + } + + int else_movs = 0; + foreach_inst_in_block(fs_inst, inst, else_block) { + if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV || + inst->flags_written(devinfo)) + break; + + else_mov[else_movs] = inst; + else_movs++; + } + + return MIN2(then_movs, else_movs); +} + +/** + * Try to replace IF/MOV+/ELSE/MOV+/ENDIF with SEL. 
+ * + * Many GLSL shaders contain the following pattern: + * + * x = condition ? foo : bar + * + * or + * + * if (...) a.xyzw = foo.xyzw; + * else a.xyzw = bar.xyzw; + * + * The compiler emits an ir_if tree for this, since each subexpression might be + * a complex tree that could have side-effects or short-circuit logic. + * + * However, the common case is to simply select one of two constants or + * variable values---which is exactly what SEL is for. In this case, the + * assembly looks like: + * + * (+f0) IF + * MOV dst src0 + * ... + * ELSE + * MOV dst src1 + * ... + * ENDIF + * + * where each pair of MOVs to a common destination and can be easily translated + * into + * + * (+f0) SEL dst src0 src1 + * + * If src0 is an immediate value, we promote it to a temporary GRF. + */ +bool +fs_visitor::opt_peephole_sel() +{ + bool progress = false; + + foreach_block (block, cfg) { + /* IF instructions, by definition, can only be found at the ends of + * basic blocks. + */ + fs_inst *if_inst = (fs_inst *)block->end(); + if (if_inst->opcode != BRW_OPCODE_IF) + continue; + + fs_inst *else_mov[MAX_MOVS] = { NULL }; + fs_inst *then_mov[MAX_MOVS] = { NULL }; + + bblock_t *then_block = block->next(); + bblock_t *else_block = NULL; + foreach_list_typed(bblock_link, child, link, &block->children) { + if (child->block != then_block) { + if (child->block->prev()->end()->opcode == BRW_OPCODE_ELSE) { + else_block = child->block; + } + break; + } + } + if (else_block == NULL) + continue; + + int movs = count_movs_from_if(devinfo, then_mov, else_mov, then_block, else_block); + + if (movs == 0) + continue; + + /* Generate SEL instructions for pairs of MOVs to a common destination. */ + for (int i = 0; i < movs; i++) { + if (!then_mov[i] || !else_mov[i]) + break; + + /* Check that the MOVs are the right form. 
*/ + if (!then_mov[i]->dst.equals(else_mov[i]->dst) || + then_mov[i]->exec_size != else_mov[i]->exec_size || + then_mov[i]->group != else_mov[i]->group || + then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all || + then_mov[i]->is_partial_write() || + else_mov[i]->is_partial_write() || + then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE || + else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) { + movs = i; + break; + } + + /* Check that source types for mov operations match. */ + if (then_mov[i]->src[0].type != else_mov[i]->src[0].type) { + movs = i; + break; + } + } + + if (movs == 0) + continue; + + for (int i = 0; i < movs; i++) { + const fs_builder ibld = fs_builder(this, then_block, then_mov[i]) + .at(block, if_inst); + + if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) { + ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]); + } else { + /* Only the last source register can be a constant, so if the MOV + * in the "then" clause uses a constant, we need to put it in a + * temporary. + */ + fs_reg src0(then_mov[i]->src[0]); + if (src0.file == IMM) { + src0 = ibld.vgrf(then_mov[i]->src[0].type); + ibld.MOV(src0, then_mov[i]->src[0]); + } + + /* 64-bit immediates can't be placed in src1. 
*/ + fs_reg src1(else_mov[i]->src[0]); + if (src1.file == IMM && type_sz(src1.type) == 8) { + src1 = ibld.vgrf(else_mov[i]->src[0].type); + ibld.MOV(src1, else_mov[i]->src[0]); + } + + set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse, + ibld.SEL(then_mov[i]->dst, src0, src1)); + } + + then_mov[i]->remove(then_block); + else_mov[i]->remove(else_block); + } + + progress = true; + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} diff --git a/src/intel/compiler/elk/brw_fs_thread_payload.cpp b/src/intel/compiler/elk/brw_fs_thread_payload.cpp new file mode 100644 index 00000000000..b78567fa2d1 --- /dev/null +++ b/src/intel/compiler/elk/brw_fs_thread_payload.cpp @@ -0,0 +1,605 @@ +/* + * Copyright © 2006-2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/* Vertex shader payload: R0 thread header, R1 URB handles. */
vs_thread_payload::vs_thread_payload(const fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   r += reg_unit(v.devinfo);

   /* R1: URB handles. */
   urb_handles = brw_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

/* Tessellation control shader payload.  Layout differs between the
 * SINGLE_PATCH and MULTI_PATCH dispatch modes.
 */
tcs_thread_payload::tcs_thread_payload(const fs_visitor &v)
{
   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(v.prog_data);
   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) v.key;

   if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH) {
      patch_urb_output = brw_ud1_grf(0, 0);
      primitive_id = brw_vec1_grf(0, 1);

      /* r1-r4 contain the ICP handles. */
      icp_handle_start = brw_ud8_grf(1, 0);

      num_regs = 5;
   } else {
      assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
      assert(tcs_key->input_vertices <= BRW_MAX_TCS_INPUT_VERTICES);

      unsigned r = 0;

      r += reg_unit(v.devinfo);

      patch_urb_output = brw_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);

      if (tcs_prog_data->include_primitive_id) {
         primitive_id = brw_vec8_grf(r, 0);
         r += reg_unit(v.devinfo);
      }

      /* ICP handles occupy the next 1-32 registers. */
      icp_handle_start = brw_ud8_grf(r, 0);
      r += brw_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);

      num_regs = r;
   }
}

/* Tessellation evaluation shader payload. */
tes_thread_payload::tes_thread_payload(const fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread Header. */
   patch_urb_input = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
   primitive_id = brw_vec1_grf(0, 1);
   r += reg_unit(v.devinfo);

   /* R1-3: gl_TessCoord.xyz. */
   for (unsigned i = 0; i < 3; i++) {
      coords[i] = brw_vec8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* R4: URB output handles. */
   urb_output = brw_ud8_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

/* Geometry shader payload.  Non-const visitor because it emits the AND/SHR
 * instructions that unpack the URB handle and instance ID fields.
 */
gs_thread_payload::gs_thread_payload(fs_visitor &v)
{
   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data);
   const fs_builder bld = fs_builder(&v).at_end();

   /* R0: thread header. */
   unsigned r = reg_unit(v.devinfo);

   /* R1: output URB handles. */
   urb_handles = bld.vgrf(BRW_REGISTER_TYPE_UD);
   bld.AND(urb_handles, brw_ud8_grf(r, 0),
           v.devinfo->ver >= 20 ? brw_imm_ud(0xFFFFFF) : brw_imm_ud(0xFFFF));

   /* R1: Instance ID stored in bits 31:27 */
   instance_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
   bld.SHR(instance_id, brw_ud8_grf(r, 0), brw_imm_ud(27u));

   r += reg_unit(v.devinfo);

   if (gs_prog_data->include_primitive_id) {
      primitive_id = brw_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* Always enable VUE handles so we can safely use pull model if needed.
    *
    * The push model for a GS uses a ton of register space even for trivial
    * scenarios with just a few inputs, so just make things easier and a bit
    * safer by always having pull model available.
    */
   gs_prog_data->base.include_vue_handles = true;

   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   icp_handle_start = brw_ud8_grf(r, 0);
   r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);

   num_regs = r;

   /* Use a maximum of 24 registers for push-model inputs. */
   const unsigned max_push_components = 24;

   /* If pushing our inputs would take too many registers, reduce the URB read
    * length (which is in HWords, or 8 registers), and resort to pulling.
    *
    * Note that the GS reads HWords for every vertex - so we
    * have to multiply by VerticesIn to obtain the total storage requirement.
    */
   if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
       max_push_components) {
      vue_prog_data->urb_read_length =
         ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
   }
}

/* Fragment shader payload layout for Xe2 (gfx20+). */
static inline void
setup_fs_payload_gfx20(fs_thread_payload &payload,
                       const fs_visitor &v,
                       bool &source_depth_to_render_target)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
   const unsigned payload_width = 16;
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 20);

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R0-1: PS thread payload header, masks and pixel X/Y coordinates. */
      payload.num_regs++;
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R2-13: Barycentric interpolation coordinates. These appear
       * in the same order that they appear in the brw_barycentric_mode
       * enum. Each set of coordinates occupies 2 64B registers per
       * SIMD16 half. Coordinates only appear if they were enabled
       * using the "Barycentric Interpolation Mode" bits in WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R14: Interpolated depth if "Pixel Shader Uses Source Depth" is set. */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R15: Interpolated W if "Pixel Shader Uses Source W" is set. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R16: MSAA input coverage mask if "Pixel Shader Uses Input
       * Coverage Mask" is set.
       */
      if (prog_data->uses_sample_mask) {
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R19: MSAA position XY offsets if "Position XY Offset Select"
       * is either POSOFFSET_CENTROID or POSOFFSET_SAMPLE. Note that
       * this is delivered as a single SIMD32 vector, inconsistently
       * with most other PS payload fields.
       */
      if (prog_data->uses_pos_offset && j == 0) {
         for (unsigned k = 0; k < 2; k++) {
            payload.sample_pos_reg[k] = payload.num_regs;
            payload.num_regs++;
         }
      }
   }

   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs += 2;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

/* Fragment shader payload layout for gfx6 through gfx12.x. */
static inline void
setup_fs_payload_gfx6(fs_thread_payload &payload,
                      const fs_visitor &v,
                      bool &source_depth_to_render_target)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);

   const unsigned payload_width = MIN2(16, v.dispatch_width);
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 6 && v.devinfo->ver < 20);

   payload.num_regs = 0;

   /* R0: PS thread payload header. */
   payload.num_regs++;

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R1: masks, pixel X/Y coordinates. */
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R3-26: barycentric interpolation coordinates. These appear in the
       * same order that they appear in the brw_barycentric_mode enum. Each
       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
       * registers if dispatch width == 16. Coordinates only appear if they
       * were enabled using the "Barycentric Interpolation Mode" bits in
       * WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R27-28: interpolated depth if uses source depth */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg[j] = payload.num_regs;
         payload.num_regs++;
      }

      /* R32-33: MSAA input coverage mask */
      if (prog_data->uses_sample_mask) {
         assert(v.devinfo->ver >= 7);
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }
   }

   /* R66: Source Depth and/or W Attribute Vertex Deltas */
   if (prog_data->uses_depth_w_coefficients) {
      assert(v.max_polygons == 1);
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs++;
   }

   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

#undef P /* prompted depth */
#undef C /* computed */
#undef N /* non-promoted? */

#define P 0
#define C 1
#define N 2

/* Per-lookup description of the pre-gfx6 windower interpolation ("IZ")
 * behavior, indexed by the BRW_WM_IZ_* bits of the program key.
 */
static const struct {
   GLuint mode:2;
   GLuint sd_present:1;
   GLuint sd_to_rt:1;
   GLuint dd_present:1;
   GLuint ds_present:1;
} wm_iz_table[BRW_WM_IZ_BIT_MAX] =
{
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { N, 0, 1, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 0 },
   { C, 0, 1, 1, 0 },
   { C, 0, 1, 1, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 1 },
   { N, 0, 1, 0, 1 },
   { N, 0, 1, 0, 1 },
   { P, 0, 0, 0, 0 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { P, 0, 0, 0, 0 },
   { N, 1, 1, 0, 1 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { P, 0, 0, 0, 0 },
   { C, 0, 0, 0, 1 },
   { P, 0, 0, 0, 0 },
   { C, 0, 1, 0, 1 },
   { P, 0, 0, 0, 0 },
   { C, 1, 1, 0, 1 },
   { C, 0, 1, 0, 1 },
   { C, 0, 1, 0, 1 },
   { P, 0, 0, 0, 0 },
   { C, 1, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { P, 0, 0, 0, 0 },
   { C, 1, 1, 1, 1 },
   { C, 0, 1, 1, 1 },
   { C, 0, 1, 1, 1 }
};

/**
 * \param line_aa BRW_NEVER, BRW_ALWAYS or BRW_SOMETIMES
 * \param lookup bitmask of BRW_WM_IZ_* flags
 */
static inline void
setup_fs_payload_gfx4(fs_thread_payload &payload,
                      const fs_visitor &v,
                      bool &source_depth_to_render_target,
                      bool &runtime_check_aads_emit)
{
   assert(v.dispatch_width <= 16);

   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
   brw_wm_prog_key *key = (brw_wm_prog_key *) v.key;

   GLuint reg = 1;
   bool kill_stats_promoted_workaround = false;
   int lookup = key->iz_lookup;

   assert(lookup < BRW_WM_IZ_BIT_MAX);

   /* Crazy workaround in the windowizer, which we need to track in
    * our register allocation and render target writes. See the "If
    * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
    * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
    */
   if (key->stats_wm &&
       (lookup & BRW_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
       wm_iz_table[lookup].mode == P) {
      kill_stats_promoted_workaround = true;
   }

   payload.subspan_coord_reg[0] = reg++;

   if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
       kill_stats_promoted_workaround) {
      payload.source_depth_reg[0] = reg;
      reg += 2;
   }

   if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
      source_depth_to_render_target = true;

   if (wm_iz_table[lookup].ds_present || key->line_aa != BRW_NEVER) {
      payload.aa_dest_stencil_reg[0] = reg;
      runtime_check_aads_emit =
         !wm_iz_table[lookup].ds_present && key->line_aa == BRW_SOMETIMES;
      reg++;
   }

   if (wm_iz_table[lookup].dd_present) {
      payload.dest_depth_reg[0] = reg;
      reg+=2;
   }

   payload.num_regs = reg;
}

#undef P /* prompted depth */
#undef C /* computed */
#undef N /* non-promoted? */

/* Dispatch to the generation-specific FS payload setup above. */
fs_thread_payload::fs_thread_payload(const fs_visitor &v,
                                     bool &source_depth_to_render_target,
                                     bool &runtime_check_aads_emit)
  : subspan_coord_reg(),
    source_depth_reg(),
    source_w_reg(),
    aa_dest_stencil_reg(),
    dest_depth_reg(),
    sample_pos_reg(),
    sample_mask_in_reg(),
    depth_w_coef_reg(),
    barycentric_coord_reg()
{
   if (v.devinfo->ver >= 20)
      setup_fs_payload_gfx20(*this, v, source_depth_to_render_target);
   else if (v.devinfo->ver >= 6)
      setup_fs_payload_gfx6(*this, v, source_depth_to_render_target);
   else
      setup_fs_payload_gfx4(*this, v, source_depth_to_render_target,
                            runtime_check_aads_emit);
}

/* Compute shader payload. */
cs_thread_payload::cs_thread_payload(const fs_visitor &v)
{
   struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);

   unsigned r = reg_unit(v.devinfo);

   /* See nir_setup_uniforms for subgroup_id in earlier versions. */
   if (v.devinfo->verx10 >= 125) {
      subgroup_id_ = brw_ud1_grf(0, 2);

      for (int i = 0; i < 3; i++) {
         if (prog_data->generate_local_id & (1 << i)) {
            local_invocation_id[i] = brw_uw8_grf(r, 0);
            r += reg_unit(v.devinfo);
            if (v.devinfo->ver < 20 && v.dispatch_width == 32)
               r += reg_unit(v.devinfo);
         } else {
            local_invocation_id[i] = brw_imm_uw(0);
         }
      }

      /* TODO: Fill out uses_btd_stack_ids automatically */
      if (prog_data->uses_btd_stack_ids)
         r += reg_unit(v.devinfo);
   }

   num_regs = r;
}

/* Load the 8-bit subgroup ID into dest, either from the payload (gfx12.5+)
 * or from a pushed uniform on older platforms.
 */
void
cs_thread_payload::load_subgroup_id(const fs_builder &bld,
                                    fs_reg &dest) const
{
   auto devinfo = bld.shader->devinfo;
   dest = retype(dest, BRW_REGISTER_TYPE_UD);

   if (subgroup_id_.file != BAD_FILE) {
      assert(devinfo->verx10 >= 125);
      bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0)));
   } else {
      assert(devinfo->verx10 < 125);
      assert(gl_shader_stage_is_compute(bld.shader->stage));
      int index = brw_get_subgroup_id_param_index(devinfo,
                                                  bld.shader->stage_prog_data);
      bld.MOV(dest, fs_reg(UNIFORM, index, BRW_REGISTER_TYPE_UD));
   }
}

/* Task/Mesh shader payload, built on top of the CS payload. */
task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
   : cs_thread_payload(v)
{
   /* Task and Mesh Shader Payloads (SIMD8 and SIMD16)
    *
    *  R0: Header
    *  R1: Local_ID.X[0-7 or 0-15]
    *  R2: Inline Parameter
    *
    * Task and Mesh Shader Payloads (SIMD32)
    *
    *  R0: Header
    *  R1: Local_ID.X[0-15]
    *  R2: Local_ID.X[16-31]
    *  R3: Inline Parameter
    *
    * Local_ID.X values are 16 bits.
    *
    * Inline parameter is optional but always present since we use it to pass
    * the address to descriptors.
    */

   const fs_builder bld = fs_builder(&v).at_end();

   unsigned r = 0;
   assert(subgroup_id_.file != BAD_FILE);
   extended_parameter_0 = retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);

   if (v.devinfo->ver >= 20) {
      urb_output = brw_ud1_grf(1, 0);
   } else {
      urb_output = bld.vgrf(BRW_REGISTER_TYPE_UD);
      /* In both mesh and task shader payload, lower 16 bits of g0.6 is
       * an offset within Slice's Local URB, which says where shader is
       * supposed to output its data.
       */
      bld.AND(urb_output, brw_ud1_grf(0, 6), brw_imm_ud(0xFFFF));
   }

   if (v.stage == MESA_SHADER_MESH) {
      /* g0.7 is Task Shader URB Entry Offset, which contains both an offset
       * within Slice's Local URB (bits 0:15) and a slice selector
       * (bits 16:24). Slice selector can be non zero when mesh shader
       * is spawned on slice other than the one where task shader was run.
       * Bit 24 says that Slice ID is present and bits 16:23 is the Slice ID.
       */
      task_urb_input = brw_ud1_grf(0, 7);
   }
   r += reg_unit(v.devinfo);

   local_index = brw_uw8_grf(r, 0);
   r += reg_unit(v.devinfo);
   if (v.devinfo->ver < 20 && v.dispatch_width == 32)
      r += reg_unit(v.devinfo);

   inline_parameter = brw_ud1_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

/* Bindless (ray tracing) shader payload. */
bs_thread_payload::bs_thread_payload(const fs_visitor &v)
{
   unsigned r = 0;

   /* R0: Thread header. */
   r += reg_unit(v.devinfo);

   /* R1: Stack IDs. */
   r += reg_unit(v.devinfo);

   /* R2: Inline Parameter.  Used for argument addresses. */
   global_arg_ptr = brw_ud1_grf(r, 0);
   local_arg_ptr = brw_ud1_grf(r, 2);
   r += reg_unit(v.devinfo);

   num_regs = r;
}

/* Extract the 4-bit BTD shader type from the thread header into dest. */
void
bs_thread_payload::load_shader_type(const fs_builder &bld, fs_reg &dest) const
{
   fs_reg ud_dest = retype(dest, BRW_REGISTER_TYPE_UD);
   bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type));
   bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf));
}
diff --git a/src/intel/compiler/elk/brw_fs_validate.cpp b/src/intel/compiler/elk/brw_fs_validate.cpp
new file mode 100644
index 00000000000..499bc8181c3
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_validate.cpp
@@ -0,0 +1,199 @@
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_validate.cpp
 *
 * Implements a pass that validates various invariants of the IR. The current
 * pass only validates that GRF's uses are sane.
More can be added later.
 */

#include "brw_fs.h"
#include "brw_cfg.h"

/* Abort with a diagnostic dump of the offending instruction when the
 * condition does not hold.  Expects `stage` and `inst` in scope.
 */
#define fsv_assert(assertion)                                           \
   {                                                                    \
      if (!(assertion)) {                                               \
         fprintf(stderr, "ASSERT: Scalar %s validation failed!\n",      \
                 _mesa_shader_stage_to_abbrev(stage));                  \
         dump_instruction(inst, stderr);                                \
         fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \
         abort();                                                       \
      }                                                                 \
   }

/* Equality variant; prints both operand values on failure. */
#define fsv_assert_eq(first, second)                                    \
   {                                                                    \
      unsigned f = (first);                                             \
      unsigned s = (second);                                            \
      if (f != s) {                                                     \
         fprintf(stderr, "ASSERT: Scalar %s validation failed!\n",      \
                 _mesa_shader_stage_to_abbrev(stage));                  \
         dump_instruction(inst, stderr);                                \
         fprintf(stderr, "%s:%d: A == B failed\n", __FILE__, __LINE__); \
         fprintf(stderr, "  A = %s = %u\n", #first, f);                 \
         fprintf(stderr, "  B = %s = %u\n", #second, s);                \
         abort();                                                       \
      }                                                                 \
   }

/* Inequality variant. */
#define fsv_assert_ne(first, second)                                    \
   {                                                                    \
      unsigned f = (first);                                             \
      unsigned s = (second);                                            \
      if (f == s) {                                                     \
         fprintf(stderr, "ASSERT: Scalar %s validation failed!\n",      \
                 _mesa_shader_stage_to_abbrev(stage));                  \
         dump_instruction(inst, stderr);                                \
         fprintf(stderr, "%s:%d: A != B failed\n", __FILE__, __LINE__); \
         fprintf(stderr, "  A = %s = %u\n", #first, f);                 \
         fprintf(stderr, "  B = %s = %u\n", #second, s);                \
         abort();                                                       \
      }                                                                 \
   }

/* Less-or-equal variant; note operands compare as unsigned. */
#define fsv_assert_lte(first, second)                                   \
   {                                                                    \
      unsigned f = (first);                                             \
      unsigned s = (second);                                            \
      if (f > s) {                                                      \
         fprintf(stderr, "ASSERT: Scalar %s validation failed!\n",      \
                 _mesa_shader_stage_to_abbrev(stage));                  \
         dump_instruction(inst, stderr);                                \
         fprintf(stderr, "%s:%d: A <= B failed\n", __FILE__, __LINE__); \
         fprintf(stderr, "  A = %s = %u\n", #first, f);                 \
         fprintf(stderr, "  B = %s = %u\n", #second, s);                \
         abort();                                                       \
      }                                                                 \
   }

#ifndef NDEBUG
void
fs_visitor::validate()
{
   cfg->validate(_mesa_shader_stage_to_abbrev(stage));

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case SHADER_OPCODE_SEND:
         fsv_assert(is_uniform(inst->src[0]) && is_uniform(inst->src[1]));
         break;

      case BRW_OPCODE_MOV:
         fsv_assert(inst->sources == 1);
         break;

      default:
         break;
      }

      if (inst->is_3src(compiler)) {
         const unsigned integer_sources =
            brw_reg_type_is_integer(inst->src[0].type) +
            brw_reg_type_is_integer(inst->src[1].type) +
            brw_reg_type_is_integer(inst->src[2].type);
         const unsigned float_sources =
            brw_reg_type_is_floating_point(inst->src[0].type) +
            brw_reg_type_is_floating_point(inst->src[1].type) +
            brw_reg_type_is_floating_point(inst->src[2].type);

         fsv_assert((integer_sources == 3 && float_sources == 0) ||
                    (integer_sources == 0 && float_sources == 3));

         if (devinfo->ver >= 10) {
            for (unsigned i = 0; i < 3; i++) {
               if (inst->src[i].file == BRW_IMMEDIATE_VALUE)
                  continue;

               switch (inst->src[i].vstride) {
               case BRW_VERTICAL_STRIDE_0:
               case BRW_VERTICAL_STRIDE_4:
               case BRW_VERTICAL_STRIDE_8:
               case BRW_VERTICAL_STRIDE_16:
                  break;

               case BRW_VERTICAL_STRIDE_1:
                  fsv_assert_lte(12, devinfo->ver);
                  break;

               case BRW_VERTICAL_STRIDE_2:
                  fsv_assert_lte(devinfo->ver, 11);
                  break;

               default:
                  fsv_assert(!"invalid vstride");
                  break;
               }
            }
         } else if (grf_used != 0) {
            /* Only perform the pre-Gfx10 checks after register allocation has
             * occurred.
             *
             * Many passes (e.g., constant copy propagation) will generate
             * invalid 3-source instructions with the expectation that later
             * passes (e.g., combine constants) will fix them.
             */
            for (unsigned i = 0; i < 3; i++) {
               fsv_assert_ne(inst->src[i].file, BRW_IMMEDIATE_VALUE);

               /* A stride of 1 (the usual case) or 0, with a special
                * "repctrl" bit, is allowed. The repctrl bit doesn't work for
                * 64-bit datatypes, so if the source type is 64-bit then only
                * a stride of 1 is allowed. From the Broadwell PRM, Volume 7
                * "3D Media GPGPU", page 944:
                *
                *    This is applicable to 32b datatypes and 16b datatype. 64b
                *    datatypes cannot use the replicate control.
                */
               fsv_assert_lte(inst->src[i].vstride, 1);

               if (type_sz(inst->src[i].type) > 4)
                  fsv_assert_eq(inst->src[i].vstride, 1);
            }
         }
      }

      if (inst->dst.file == VGRF) {
         fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst),
                        alloc.sizes[inst->dst.nr]);
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            fsv_assert_lte(inst->src[i].offset / REG_SIZE + regs_read(inst, i),
                           alloc.sizes[inst->src[i].nr]);
         }
      }

      /* Accumulator Registers, bspec 47251:
       *
       *    "When destination is accumulator with offset 0, destination
       *    horizontal stride must be 1."
       */
      if (intel_needs_workaround(devinfo, 14014617373) &&
          inst->dst.is_accumulator() &&
          inst->dst.offset == 0) {
         fsv_assert_eq(inst->dst.stride, 1);
      }
   }
}
#endif
diff --git a/src/intel/compiler/elk/brw_fs_visitor.cpp b/src/intel/compiler/elk/brw_fs_visitor.cpp
new file mode 100644
index 00000000000..9f7f1befd83
--- /dev/null
+++ b/src/intel/compiler/elk/brw_fs_visitor.cpp
@@ -0,0 +1,1266 @@
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_visitor.cpp
 *
 * This file supports generating the FS LIR from the GLSL IR. The LIR
 * makes it easier to do backend-specific optimizations than doing so
 * in the GLSL IR or in the native code.
 */
#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_nir.h"
#include "compiler/glsl_types.h"

using namespace brw;

/* Input data is organized with first the per-primitive values, followed
 * by per-vertex values.  The per-vertex will have interpolation information
 * associated, so use 4 components for each value.
 */

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
fs_reg
fs_visitor::interp_reg(const fs_builder &bld, unsigned location,
                       unsigned channel, unsigned comp)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   /* Must not be a per-primitive input; those go through per_primitive_reg(). */
   assert(BITFIELD64_BIT(location) & ~nir->info.per_primitive_inputs);

   const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   assert(prog_data->urb_setup[location] >= 0);
   unsigned nr = prog_data->urb_setup[location];
   channel += prog_data->urb_setup_channel[location];

   /* Adjust so we start counting from the first per_vertex input. */
   assert(nr >= prog_data->num_per_primitive_inputs);
   nr -= prog_data->num_per_primitive_inputs;

   const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
   const unsigned regnr = per_vertex_start + (nr * 4) + channel;

   if (max_polygons > 1) {
      /* In multipolygon dispatch each plane parameter is a
       * dispatch_width-wide SIMD vector (see comment in
       * assign_urb_setup()), so we need to use offset() instead of
       * component() to select the specified parameter.
       */
      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.MOV(tmp, offset(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_UD),
                          dispatch_width, comp));
      return retype(tmp, BRW_REGISTER_TYPE_F);
   } else {
      return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp);
   }
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
fs_reg
fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned comp)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   /* Must be a per-primitive input. */
   assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);

   const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   comp += prog_data->urb_setup_channel[location];

   assert(prog_data->urb_setup[location] >= 0);

   const unsigned regnr = prog_data->urb_setup[location] + comp / 4;

   assert(regnr < prog_data->num_per_primitive_inputs);

   if (max_polygons > 1) {
      /* In multipolygon dispatch each primitive constant is a
       * dispatch_width-wide SIMD vector (see comment in
       * assign_urb_setup()), so we need to use offset() instead of
       * component() to select the specified parameter.
       */
      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.MOV(tmp, offset(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_UD),
                          dispatch_width, comp % 4));
      return retype(tmp, BRW_REGISTER_TYPE_F);
   } else {
      return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp % 4);
   }
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gfx4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   fs_builder abld = fs_builder(this).at_end().annotate("compute pixel centers");
   this->pixel_x = vgrf(glsl_uint_type());
   this->pixel_y = vgrf(glsl_uint_type());
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   abld.ADD(this->pixel_x,
            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
            fs_reg(brw_imm_v(0x10101010)));
   abld.ADD(this->pixel_y,
            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
            fs_reg(brw_imm_v(0x11001100)));

   const fs_builder bld = fs_builder(this).at_end();
   abld = bld.annotate("compute pixel deltas from v0");

   this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] =
      vgrf(glsl_vec2_type());
   const fs_reg &delta_xy = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
   const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
   const fs_reg ystart(negate(brw_vec1_grf(1, 1)));

   if (devinfo->has_pln) {
      for (unsigned i = 0; i < dispatch_width / 8; i++) {
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 0), i),
                             quarter(this->pixel_x, i), xstart);
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 1), i),
                             quarter(this->pixel_y, i), ystart);
      }
   } else {
      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
   }

   this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);

   /* The SF program automatically handles doing the perspective correction or
    * not based on wm_prog_data::interp_mode[] so we can use the same pixel
    * offsets for both perspective and non-perspective.
    */
   this->delta_xy[BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL] =
      this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];

   abld = bld.annotate("compute pos.w and 1/pos.w");
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = vgrf(glsl_float_type());
   abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
             interp_reg(abld, VARYING_SLOT_POS, 3, 0));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = vgrf(glsl_float_type());
   abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gfx6()
{
   const fs_builder bld = fs_builder(this).at_end();
   fs_builder abld = bld.annotate("compute pixel centers");

   this->pixel_x = vgrf(glsl_float_type());
   this->pixel_y = vgrf(glsl_float_type());

   const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) this->key;
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);

   fs_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
   fs_reg int_sample_offset_xy; /* Used on Gen8+ */
   fs_reg half_int_sample_offset_x, half_int_sample_offset_y;
   if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
      /* The thread payload only delivers subspan locations (ss0, ss1,
       * ss2, ...). Since subspans covers 2x2 pixels blocks, we need to
       * generate 4 pixel coordinates out of each subspan location. We do this
       * by replicating a subspan coordinate 4 times and adding an offset of 1
       * in each direction from the initial top left (tl) location to generate
       * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
       * (br = +1 in x, +1 in y).
       *
       * The locations we build look like this in SIMD8 :
       *
       *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
       *
       * The value 0x11001010 is a vector of 8 half byte vector.
It adds + * following to generate the 4 pixels coordinates out of the subspan0: + * + * 0x + * 1 : ss0.y + 1 -> ss0.br.y + * 1 : ss0.y + 1 -> ss0.bl.y + * 0 : ss0.y + 0 -> ss0.tr.y + * 0 : ss0.y + 0 -> ss0.tl.y + * 1 : ss0.x + 1 -> ss0.br.x + * 0 : ss0.x + 0 -> ss0.bl.x + * 1 : ss0.x + 1 -> ss0.tr.x + * 0 : ss0.x + 0 -> ss0.tl.x + * + * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixels + * coordinates out of 2 subspans coordinates in a single ADD instruction + * (twice the operation above). + */ + int_sample_offset_xy = fs_reg(brw_imm_v(0x11001010)); + half_int_sample_offset_x = fs_reg(brw_imm_uw(0)); + half_int_sample_offset_y = fs_reg(brw_imm_uw(0)); + /* On Gfx12.5, because of regioning restrictions, the interpolation code + * is slightly different and works off X & Y only inputs. The ordering + * of the half bytes here is a bit odd, with each subspan replicated + * twice and every other element is discarded : + * + * ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br + * X offset: 0 0 1 0 0 0 1 0 + * Y offset: 0 0 0 0 1 0 1 0 + */ + int_sample_offset_x = fs_reg(brw_imm_v(0x01000100)); + int_sample_offset_y = fs_reg(brw_imm_v(0x01010000)); + } + + fs_reg int_coarse_offset_x, int_coarse_offset_y; /* Used on Gen12HP+ */ + fs_reg int_coarse_offset_xy; /* Used on Gen8+ */ + fs_reg half_int_coarse_offset_x, half_int_coarse_offset_y; + if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) { + /* In coarse pixel dispatch we have to do the same ADD instruction that + * we do in normal per pixel dispatch, except this time we're not adding + * 1 in each direction, but instead the coarse pixel size. 
+ * + * The coarse pixel size is delivered as 2 u8 in r1.0 + */ + struct brw_reg r1_0 = retype(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), BRW_REGISTER_TYPE_UB); + + const fs_builder dbld = + abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0); + + if (devinfo->verx10 >= 125) { + /* To build the array of half bytes we do and AND operation with the + * right mask in X. + */ + int_coarse_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00)); + + /* And the right mask in Y. */ + int_coarse_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000)); + } else { + /* To build the array of half bytes we do and AND operation with the + * right mask in X. + */ + int_coarse_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0)); + + /* And the right mask in Y. */ + int_coarse_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000)); + + /* Finally OR the 2 registers. */ + int_coarse_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.OR(int_coarse_offset_xy, int_coarse_offset_x, int_coarse_offset_y); + } + + /* Also compute the half coarse size used to center coarses. 
*/ + half_int_coarse_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW); + half_int_coarse_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW); + + bld.SHR(half_int_coarse_offset_x, suboffset(r1_0, 0), brw_imm_ud(1)); + bld.SHR(half_int_coarse_offset_y, suboffset(r1_0, 1), brw_imm_ud(1)); + } + + fs_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */ + fs_reg int_pixel_offset_xy; /* Used on Gen8+ */ + fs_reg half_int_pixel_offset_x, half_int_pixel_offset_y; + switch (wm_prog_data->coarse_pixel_dispatch) { + case BRW_NEVER: + int_pixel_offset_x = int_sample_offset_x; + int_pixel_offset_y = int_sample_offset_y; + int_pixel_offset_xy = int_sample_offset_xy; + half_int_pixel_offset_x = half_int_sample_offset_x; + half_int_pixel_offset_y = half_int_sample_offset_y; + break; + + case BRW_SOMETIMES: { + const fs_builder dbld = + abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0); + + check_dynamic_msaa_flag(dbld, wm_prog_data, + INTEL_MSAA_FLAG_COARSE_RT_WRITES); + + int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + dbld.SEL(int_pixel_offset_x, + int_coarse_offset_x, + int_sample_offset_x)); + + int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + dbld.SEL(int_pixel_offset_y, + int_coarse_offset_y, + int_sample_offset_y)); + + int_pixel_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + dbld.SEL(int_pixel_offset_xy, + int_coarse_offset_xy, + int_sample_offset_xy)); + + half_int_pixel_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + bld.SEL(half_int_pixel_offset_x, + half_int_coarse_offset_x, + half_int_sample_offset_x)); + + half_int_pixel_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW); + set_predicate(BRW_PREDICATE_NORMAL, + bld.SEL(half_int_pixel_offset_y, + half_int_coarse_offset_y, + half_int_sample_offset_y)); + break; + } + + case BRW_ALWAYS: + int_pixel_offset_x = int_coarse_offset_x; + int_pixel_offset_y = 
int_coarse_offset_y; + int_pixel_offset_xy = int_coarse_offset_xy; + half_int_pixel_offset_x = half_int_coarse_offset_x; + half_int_pixel_offset_y = half_int_coarse_offset_y; + break; + } + + for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) { + const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i); + /* According to the "PS Thread Payload for Normal Dispatch" + * pages on the BSpec, subspan X/Y coordinates are stored in + * R1.2-R1.5/R2.2-R2.5 on gfx6+, and on R0.10-R0.13/R1.10-R1.13 + * on gfx20+. gi_reg is the 32B section of the GRF that + * contains the subspan coordinates. + */ + const struct brw_reg gi_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) : + brw_vec1_grf(i + 1, 0); + const struct brw_reg gi_uw = retype(gi_reg, BRW_REGISTER_TYPE_UW); + + if (devinfo->verx10 >= 125) { + const fs_builder dbld = + abld.exec_all().group(hbld.dispatch_width() * 2, 0); + const fs_reg int_pixel_x = dbld.vgrf(BRW_REGISTER_TYPE_UW); + const fs_reg int_pixel_y = dbld.vgrf(BRW_REGISTER_TYPE_UW); + + dbld.ADD(int_pixel_x, + fs_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)), + int_pixel_offset_x); + dbld.ADD(int_pixel_y, + fs_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)), + int_pixel_offset_y); + + if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) { + fs_inst *addx = dbld.ADD(int_pixel_x, int_pixel_x, + horiz_stride(half_int_pixel_offset_x, 0)); + fs_inst *addy = dbld.ADD(int_pixel_y, int_pixel_y, + horiz_stride(half_int_pixel_offset_y, 0)); + if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) { + addx->predicate = BRW_PREDICATE_NORMAL; + addy->predicate = BRW_PREDICATE_NORMAL; + } + } + + hbld.MOV(offset(pixel_x, hbld, i), horiz_stride(int_pixel_x, 2)); + hbld.MOV(offset(pixel_y, hbld, i), horiz_stride(int_pixel_y, 2)); + + } else if (devinfo->ver >= 8 || dispatch_width == 8) { + /* The "Register Region Restrictions" page says for BDW (and newer, + * presumably): + * + * "When destination spans two registers, the source may be one or + * two registers. 
The destination elements must be evenly split + * between the two registers." + * + * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 + * to compute our pixel centers. + */ + const fs_builder dbld = + abld.exec_all().group(hbld.dispatch_width() * 2, 0); + fs_reg int_pixel_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW); + + dbld.ADD(int_pixel_xy, + fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)), + int_pixel_offset_xy); + + hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy, + horiz_stride(half_int_pixel_offset_x, 0)); + hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy, + horiz_stride(half_int_pixel_offset_y, 0)); + } else { + /* The "Register Region Restrictions" page says for SNB, IVB, HSW: + * + * "When destination spans two registers, the source MUST span + * two registers." + * + * Since the GRF source of the ADD will only read a single register, + * we must do two separate ADDs in SIMD16. + */ + const fs_reg int_pixel_x = hbld.vgrf(BRW_REGISTER_TYPE_UW); + const fs_reg int_pixel_y = hbld.vgrf(BRW_REGISTER_TYPE_UW); + + hbld.ADD(int_pixel_x, + fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)), + fs_reg(brw_imm_v(0x10101010))); + hbld.ADD(int_pixel_y, + fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)), + fs_reg(brw_imm_v(0x11001100))); + + /* As of gfx6, we can no longer mix float and int sources. We have + * to turn the integer pixel centers into floats for their actual + * use. + */ + hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x); + hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y); + } + } + + abld = bld.annotate("compute pos.z"); + fs_reg coarse_z; + if (wm_prog_data->uses_depth_w_coefficients) { + /* In coarse pixel mode, the HW doesn't interpolate Z coordinate + * properly. In the same way we have to add the coarse pixel size to + * pixels locations, here we recompute the Z value with 2 coefficients + * in X & Y axis. 
+ */ + fs_reg coef_payload = brw_vec8_grf(fs_payload().depth_w_coef_reg, 0); + const fs_reg x_start = brw_vec1_grf(coef_payload.nr, 2); + const fs_reg y_start = brw_vec1_grf(coef_payload.nr, 6); + const fs_reg z_cx = brw_vec1_grf(coef_payload.nr, 1); + const fs_reg z_cy = brw_vec1_grf(coef_payload.nr, 0); + const fs_reg z_c0 = brw_vec1_grf(coef_payload.nr, 3); + + const fs_reg float_pixel_x = abld.vgrf(BRW_REGISTER_TYPE_F); + const fs_reg float_pixel_y = abld.vgrf(BRW_REGISTER_TYPE_F); + + abld.ADD(float_pixel_x, this->pixel_x, negate(x_start)); + abld.ADD(float_pixel_y, this->pixel_y, negate(y_start)); + + /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */ + const fs_reg u8_cps_width = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB)); + /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */ + const fs_reg u8_cps_height = byte_offset(u8_cps_width, 1); + const fs_reg u32_cps_width = abld.vgrf(BRW_REGISTER_TYPE_UD); + const fs_reg u32_cps_height = abld.vgrf(BRW_REGISTER_TYPE_UD); + abld.MOV(u32_cps_width, u8_cps_width); + abld.MOV(u32_cps_height, u8_cps_height); + + const fs_reg f_cps_width = abld.vgrf(BRW_REGISTER_TYPE_F); + const fs_reg f_cps_height = abld.vgrf(BRW_REGISTER_TYPE_F); + abld.MOV(f_cps_width, u32_cps_width); + abld.MOV(f_cps_height, u32_cps_height); + + /* Center in the middle of the coarse pixel. 
*/ + abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width); + abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height); + + coarse_z = abld.vgrf(BRW_REGISTER_TYPE_F); + abld.MAD(coarse_z, z_c0, z_cx, float_pixel_x); + abld.MAD(coarse_z, coarse_z, z_cy, float_pixel_y); + } + + if (wm_prog_data->uses_src_depth) + this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg); + + if (wm_prog_data->uses_depth_w_coefficients || + wm_prog_data->uses_src_depth) { + fs_reg sample_z = this->pixel_z; + + switch (wm_prog_data->coarse_pixel_dispatch) { + case BRW_NEVER: + assert(wm_prog_data->uses_src_depth); + assert(!wm_prog_data->uses_depth_w_coefficients); + this->pixel_z = sample_z; + break; + + case BRW_SOMETIMES: + assert(wm_prog_data->uses_src_depth); + assert(wm_prog_data->uses_depth_w_coefficients); + this->pixel_z = abld.vgrf(BRW_REGISTER_TYPE_F); + + /* We re-use the check_dynamic_msaa_flag() call from above */ + set_predicate(BRW_PREDICATE_NORMAL, + abld.SEL(this->pixel_z, coarse_z, sample_z)); + break; + + case BRW_ALWAYS: + assert(!wm_prog_data->uses_src_depth); + assert(wm_prog_data->uses_depth_w_coefficients); + this->pixel_z = coarse_z; + break; + } + } + + if (wm_prog_data->uses_src_w) { + abld = bld.annotate("compute pos.w"); + this->pixel_w = fetch_payload_reg(abld, fs_payload().source_w_reg); + this->wpos_w = vgrf(glsl_float_type()); + abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w); + } + + if (wm_key->persample_interp == BRW_SOMETIMES) { + assert(!devinfo->needs_unlit_centroid_workaround); + + const fs_builder ubld = bld.exec_all().group(16, 0); + bool loaded_flag = false; + + for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { + if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i))) + continue; + + /* The sample mode will always be the top bit set in the perspective + * or non-perspective section. 
In the case where no SAMPLE mode was + * requested, wm_prog_data_barycentric_modes() will swap out the top + * mode for SAMPLE so this works regardless of whether SAMPLE was + * requested or not. + */ + int sample_mode; + if (BITFIELD_BIT(i) & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) { + sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes & + BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1; + } else { + sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes & + BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1; + } + assert(wm_prog_data->barycentric_interp_modes & + BITFIELD_BIT(sample_mode)); + + if (i == sample_mode) + continue; + + uint8_t *barys = fs_payload().barycentric_coord_reg[i]; + + uint8_t *sample_barys = fs_payload().barycentric_coord_reg[sample_mode]; + assert(barys[0] && sample_barys[0]); + + if (!loaded_flag) { + check_dynamic_msaa_flag(ubld, wm_prog_data, + INTEL_MSAA_FLAG_PERSAMPLE_INTERP); + } + + for (unsigned j = 0; j < dispatch_width / 8; j++) { + set_predicate( + BRW_PREDICATE_NORMAL, + ubld.MOV(brw_vec8_grf(barys[j / 2] + (j % 2) * 2, 0), + brw_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0))); + } + } + } + + for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { + this->delta_xy[i] = fetch_barycentric_reg( + bld, fs_payload().barycentric_coord_reg[i]); + } + + uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes & + (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID | + 1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); + + if (devinfo->needs_unlit_centroid_workaround && centroid_modes) { + /* Get the pixel/sample mask into f0 so that we know which + * pixels are lit. Then, for each channel that is unlit, + * replace the centroid data with non-centroid data. 
+ */ + for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) { + bld.exec_all().group(1, 0) + .MOV(retype(brw_flag_reg(0, i), BRW_REGISTER_TYPE_UW), + retype(brw_vec1_grf(1 + i, 7), BRW_REGISTER_TYPE_UW)); + } + + for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { + if (!(centroid_modes & (1 << i))) + continue; + + const fs_reg centroid_delta_xy = delta_xy[i]; + const fs_reg &pixel_delta_xy = delta_xy[i - 1]; + + delta_xy[i] = bld.vgrf(BRW_REGISTER_TYPE_F, 2); + + for (unsigned c = 0; c < 2; c++) { + for (unsigned q = 0; q < dispatch_width / 8; q++) { + set_predicate(BRW_PREDICATE_NORMAL, + bld.quarter(q).SEL( + quarter(offset(delta_xy[i], bld, c), q), + quarter(offset(centroid_delta_xy, bld, c), q), + quarter(offset(pixel_delta_xy, bld, c), q))); + } + } + } + } +} + +static enum brw_conditional_mod +cond_for_alpha_func(enum compare_func func) +{ + switch(func) { + case COMPARE_FUNC_GREATER: + return BRW_CONDITIONAL_G; + case COMPARE_FUNC_GEQUAL: + return BRW_CONDITIONAL_GE; + case COMPARE_FUNC_LESS: + return BRW_CONDITIONAL_L; + case COMPARE_FUNC_LEQUAL: + return BRW_CONDITIONAL_LE; + case COMPARE_FUNC_EQUAL: + return BRW_CONDITIONAL_EQ; + case COMPARE_FUNC_NOTEQUAL: + return BRW_CONDITIONAL_NEQ; + default: + unreachable("Not reached"); + } +} + +/** + * Alpha test support for when we compile it into the shader instead + * of using the normal fixed-function alpha test. 
+ */ +void +fs_visitor::emit_alpha_test() +{ + assert(stage == MESA_SHADER_FRAGMENT); + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + const fs_builder bld = fs_builder(this).at_end(); + const fs_builder abld = bld.annotate("Alpha test"); + + fs_inst *cmp; + if (key->alpha_test_func == COMPARE_FUNC_ALWAYS) + return; + + if (key->alpha_test_func == COMPARE_FUNC_NEVER) { + /* f0.1 = 0 */ + fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW)); + cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg, + BRW_CONDITIONAL_NEQ); + } else { + /* RT0 alpha */ + fs_reg color = offset(outputs[0], bld, 3); + + /* f0.1 &= func(color, ref) */ + cmp = abld.CMP(bld.null_reg_f(), color, brw_imm_f(key->alpha_test_ref), + cond_for_alpha_func(key->alpha_test_func)); + } + cmp->predicate = BRW_PREDICATE_NORMAL; + cmp->flag_subreg = 1; +} + +fs_inst * +fs_visitor::emit_single_fb_write(const fs_builder &bld, + fs_reg color0, fs_reg color1, + fs_reg src0_alpha, unsigned components) +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + + /* Hand over gl_FragDepth or the payload depth. */ + const fs_reg dst_depth = fetch_payload_reg(bld, fs_payload().dest_depth_reg); + fs_reg src_depth, src_stencil; + + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + src_depth = frag_depth; + } else if (source_depth_to_render_target) { + /* If we got here, we're in one of those strange Gen4-5 cases where + * we're forced to pass the source depth, unmodified, to the FB write. + * In this case, we don't want to use pixel_z because we may not have + * set up interpolation. It's also perfectly safe because it only + * happens on old hardware (no coarse interpolation) and this is + * explicitly the pass-through case. 
+ */ + assert(devinfo->ver <= 5); + src_depth = fetch_payload_reg(bld, fs_payload().source_depth_reg); + } + + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) + src_stencil = frag_stencil; + + const fs_reg sources[] = { + color0, color1, src0_alpha, src_depth, dst_depth, src_stencil, + (prog_data->uses_omask ? sample_mask : fs_reg()), + brw_imm_ud(components) + }; + assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS); + fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(), + sources, ARRAY_SIZE(sources)); + + if (prog_data->uses_kill) { + write->predicate = BRW_PREDICATE_NORMAL; + write->flag_subreg = sample_mask_flag_subreg(*this); + } + + return write; +} + +void +fs_visitor::do_emit_fb_writes(int nr_color_regions, bool replicate_alpha) +{ + const fs_builder bld = fs_builder(this).at_end(); + fs_inst *inst = NULL; + + for (int target = 0; target < nr_color_regions; target++) { + /* Skip over outputs that weren't written. */ + if (this->outputs[target].file == BAD_FILE) + continue; + + const fs_builder abld = bld.annotate( + ralloc_asprintf(this->mem_ctx, "FB write target %d", target)); + + fs_reg src0_alpha; + if (devinfo->ver >= 6 && replicate_alpha && target != 0) + src0_alpha = offset(outputs[0], bld, 3); + + inst = emit_single_fb_write(abld, this->outputs[target], + this->dual_src_output, src0_alpha, 4); + inst->target = target; + } + + if (inst == NULL) { + /* Even if there's no color buffers enabled, we still need to send + * alpha out the pipeline to our null renderbuffer to support + * alpha-testing, alpha-to-coverage, and so on. + */ + /* FINISHME: Factor out this frequently recurring pattern into a + * helper function. 
+ */ + const fs_reg srcs[] = { reg_undef, reg_undef, + reg_undef, offset(this->outputs[0], bld, 3) }; + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); + bld.LOAD_PAYLOAD(tmp, srcs, 4, 0); + + inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4); + inst->target = 0; + } + + inst->last_rt = true; + inst->eot = true; +} + +void +fs_visitor::emit_fb_writes() +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + + if (source_depth_to_render_target && devinfo->ver == 6) { + /* For outputting oDepth on gfx6, SIMD8 writes have to be used. This + * would require SIMD8 moves of each half to message regs, e.g. by using + * the SIMD lowering pass. Unfortunately this is more difficult than it + * sounds because the SIMD8 single-source message lacks channel selects + * for the second and third subspans. + */ + limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n"); + } + + if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) { + /* From the 'Render Target Write message' section of the docs: + * "Output Stencil is not supported with SIMD16 Render Target Write + * Messages." + */ + limit_dispatch_width(8, "gl_FragStencilRefARB unsupported " + "in SIMD16+ mode.\n"); + } + + /* ANV doesn't know about sample mask output during the wm key creation + * so we compute if we need replicate alpha and emit alpha to coverage + * workaround here. 
+ */ + const bool replicate_alpha = key->alpha_test_replicate_alpha || + (key->nr_color_regions > 1 && key->alpha_to_coverage && + (sample_mask.file == BAD_FILE || devinfo->ver == 6)); + + prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE && + this->outputs[0].file != BAD_FILE); + assert(!prog_data->dual_src_blend || key->nr_color_regions == 1); + + /* Following condition implements Wa_14017468336: + * + * "If dual source blend is enabled do not enable SIMD32 dispatch" and + * "For a thread dispatched as SIMD32, must not issue SIMD8 message with Last + * Render Target Select set." + */ + if (devinfo->ver >= 11 && devinfo->ver <= 12 && + prog_data->dual_src_blend) { + /* The dual-source RT write messages fail to release the thread + * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs. + * + * XXX - Emit an extra single-source NULL RT-write marked LastRT in + * order to release the thread dependency without disabling + * SIMD32. + * + * The dual-source RT write messages may lead to hangs with SIMD16 + * dispatch on ICL due some unknown reasons, see + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183 + */ + limit_dispatch_width(8, "Dual source blending unsupported " + "in SIMD16 and SIMD32 modes.\n"); + } + + do_emit_fb_writes(key->nr_color_regions, replicate_alpha); +} + +void +fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) +{ + int slot, urb_offset, length; + int starting_urb_offset = 0; + const struct brw_vue_prog_data *vue_prog_data = + brw_vue_prog_data(this->prog_data); + const struct brw_vs_prog_key *vs_key = + (const struct brw_vs_prog_key *) this->key; + const GLbitfield64 psiz_mask = + VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ | VARYING_BIT_PRIMITIVE_SHADING_RATE; + const struct intel_vue_map *vue_map = &vue_prog_data->vue_map; + bool flush; + fs_reg sources[8]; + fs_reg urb_handle; + + switch (stage) { + case MESA_SHADER_VERTEX: + urb_handle = vs_payload().urb_handles; + break; + case 
MESA_SHADER_TESS_EVAL: + urb_handle = tes_payload().urb_output; + break; + case MESA_SHADER_GEOMETRY: + urb_handle = gs_payload().urb_handles; + break; + default: + unreachable("invalid stage"); + } + + const fs_builder bld = fs_builder(this).at_end(); + + fs_reg per_slot_offsets; + + if (stage == MESA_SHADER_GEOMETRY) { + const struct brw_gs_prog_data *gs_prog_data = + brw_gs_prog_data(this->prog_data); + + /* We need to increment the Global Offset to skip over the control data + * header and the extra "Vertex Count" field (1 HWord) at the beginning + * of the VUE. We're counting in OWords, so the units are doubled. + */ + starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords; + if (gs_prog_data->static_vertex_count == -1) + starting_urb_offset += 2; + + /* The URB offset is in 128-bit units, so we need to multiply by 2 */ + const int output_vertex_size_owords = + gs_prog_data->output_vertex_size_hwords * 2; + + if (gs_vertex_count.file == IMM) { + per_slot_offsets = brw_imm_ud(output_vertex_size_owords * + gs_vertex_count.ud); + } else { + per_slot_offsets = vgrf(glsl_uint_type()); + bld.MUL(per_slot_offsets, gs_vertex_count, + brw_imm_ud(output_vertex_size_owords)); + } + } + + length = 0; + urb_offset = starting_urb_offset; + flush = false; + + /* SSO shaders can have VUE slots allocated which are never actually + * written to, so ignore them when looking for the last (written) slot. + */ + int last_slot = vue_map->num_slots - 1; + while (last_slot > 0 && + (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD || + outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) { + last_slot--; + } + + bool urb_written = false; + for (slot = 0; slot < vue_map->num_slots; slot++) { + int varying = vue_map->slot_to_varying[slot]; + switch (varying) { + case VARYING_SLOT_PSIZ: { + /* The point size varying slot is the vue header and is always in the + * vue map. 
But often none of the special varyings that live there + * are written and in that case we can skip writing to the vue + * header, provided the corresponding state properly clamps the + * values further down the pipeline. */ + if ((vue_map->slots_valid & psiz_mask) == 0) { + assert(length == 0); + urb_offset++; + break; + } + + fs_reg zero(VGRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UD); + bld.MOV(zero, brw_imm_ud(0u)); + + if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE && + this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) { + sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE]; + } else if (devinfo->has_coarse_pixel_primitive_and_cb) { + uint32_t one_fp16 = 0x3C00; + fs_reg one_by_one_fp16(VGRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UD); + bld.MOV(one_by_one_fp16, brw_imm_ud((one_fp16 << 16) | one_fp16)); + sources[length++] = one_by_one_fp16; + } else { + sources[length++] = zero; + } + + if (vue_map->slots_valid & VARYING_BIT_LAYER) + sources[length++] = this->outputs[VARYING_SLOT_LAYER]; + else + sources[length++] = zero; + + if (vue_map->slots_valid & VARYING_BIT_VIEWPORT) + sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT]; + else + sources[length++] = zero; + + if (vue_map->slots_valid & VARYING_BIT_PSIZ) + sources[length++] = this->outputs[VARYING_SLOT_PSIZ]; + else + sources[length++] = zero; + break; + } + case BRW_VARYING_SLOT_NDC: + case VARYING_SLOT_EDGE: + unreachable("unexpected scalar vs output"); + break; + + default: + /* gl_Position is always in the vue map, but isn't always written by + * the shader. Other varyings (clip distances) get added to the vue + * map but don't always get written. In those cases, the + * corresponding this->output[] slot will be invalid we and can skip + * the urb write for the varying. If we've already queued up a vue + * slot for writing we flush a mlen 5 urb write, otherwise we just + * advance the urb_offset. 
+ */ + if (varying == BRW_VARYING_SLOT_PAD || + this->outputs[varying].file == BAD_FILE) { + if (length > 0) + flush = true; + else + urb_offset++; + break; + } + + if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color && + (varying == VARYING_SLOT_COL0 || + varying == VARYING_SLOT_COL1 || + varying == VARYING_SLOT_BFC0 || + varying == VARYING_SLOT_BFC1)) { + /* We need to clamp these guys, so do a saturating MOV into a + * temp register and use that for the payload. + */ + for (int i = 0; i < 4; i++) { + fs_reg reg = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), + outputs[varying].type); + fs_reg src = offset(this->outputs[varying], bld, i); + set_saturate(true, bld.MOV(reg, src)); + sources[length++] = reg; + } + } else { + int slot_offset = 0; + + /* When using Primitive Replication, there may be multiple slots + * assigned to POS. + */ + if (varying == VARYING_SLOT_POS) + slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS]; + + for (unsigned i = 0; i < 4; i++) { + sources[length++] = offset(this->outputs[varying], bld, + i + (slot_offset * 4)); + } + } + break; + } + + const fs_builder abld = bld.annotate("URB write"); + + /* If we've queued up 8 registers of payload (2 VUE slots), if this is + * the last slot or if we need to flush (see BAD_FILE varying case + * above), emit a URB write send now to flush out the data. 
+ */ + if (length == 8 || (length > 0 && slot == last_slot)) + flush = true; + if (flush) { + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets; + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, + alloc.allocate((dispatch_width / 8) * length), + BRW_REGISTER_TYPE_F); + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length); + abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); + + fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + + /* For ICL Wa_1805992985 one needs additional write in the end. */ + if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) + inst->eot = false; + else + inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY; + + inst->offset = urb_offset; + urb_offset = starting_urb_offset + slot + 1; + length = 0; + flush = false; + urb_written = true; + } + } + + /* If we don't have any valid slots to write, just do a minimal urb write + * send to terminate the shader. This includes 1 slot of undefined data, + * because it's invalid to write 0 data: + * + * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions - + * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read > + * Write Data Payload: + * + * "The write data payload can be between 1 and 8 message phases long." + */ + if (!urb_written) { + /* For GS, just turn EmitVertex() into a no-op. We don't want it to + * end the thread, and emit_gs_thread_end() already emits a SEND with + * EOT at the end of the program for us. 
+ */ + if (stage == MESA_SHADER_GEOMETRY) + return; + + fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UD); + + bld.exec_all().MOV(uniform_urb_handle, urb_handle); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle; + srcs[URB_LOGICAL_SRC_DATA] = payload; + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1); + + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); + inst->eot = true; + inst->offset = 1; + return; + } + + /* ICL Wa_1805992985: + * + * ICLLP GPU hangs on one of tessellation vkcts tests with DS not done. The + * send cycle, which is a urb write with an eot must be 4 phases long and + * all 8 lanes must valid. + */ + if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) { + assert(dispatch_width == 8); + fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg uniform_mask = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(4), BRW_REGISTER_TYPE_UD); + + /* Workaround requires all 8 channels (lanes) to be valid. This is + * understood to mean they all need to be alive. First trick is to find + * a live channel and copy its urb handle for all the other channels to + * make sure all handles are valid. + */ + bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle)); + + /* Second trick is to use masked URB write where one can tell the HW to + * actually write data only for selected channels even though all are + * active. + * Third trick is to take advantage of the must-be-zero (MBZ) area in + * the very beginning of the URB. + * + * One masks data to be written only for the first channel and uses + * offset zero explicitly to land data to the MBZ area avoiding trashing + * any other part of the URB. 
+ * + * Since the WA says that the write needs to be 4 phases long one uses + * 4 slots data. All are explicitly zeros in order to to keep the MBZ + * area written as zeros. + */ + bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u)); + bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u)); + bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u)); + bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u)); + bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u)); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask; + srcs[URB_LOGICAL_SRC_DATA] = payload; + srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(4); + + fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->eot = true; + inst->offset = 0; + } +} + +void +fs_visitor::emit_urb_fence() +{ + const fs_builder bld = fs_builder(this).at_end(); + fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_inst *fence = bld.emit(SHADER_OPCODE_MEMORY_FENCE, dst, + brw_vec8_grf(0, 0), + brw_imm_ud(true), + brw_imm_ud(0)); + fence->sfid = BRW_SFID_URB; + fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_LOCAL, + LSC_FLUSH_TYPE_NONE, true); + + bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE, + bld.null_reg_ud(), + &dst, + 1); +} + +void +fs_visitor::emit_cs_terminate() +{ + assert(devinfo->ver >= 7); + const fs_builder bld = fs_builder(this).at_end(); + + /* We can't directly send from g0, since sends with EOT have to use + * g112-127. So, copy it to a virtual register, The register allocator will + * make sure it uses the appropriate register range. + */ + struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + bld.group(8, 0).exec_all().MOV(payload, g0); + + /* Send a message to the thread spawner to terminate the thread. 
*/ + fs_inst *inst = bld.exec_all() + .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload); + inst->eot = true; +} + +fs_visitor::fs_visitor(const struct brw_compiler *compiler, + const struct brw_compile_params *params, + const brw_base_prog_key *key, + struct brw_stage_prog_data *prog_data, + const nir_shader *shader, + unsigned dispatch_width, + bool needs_register_pressure, + bool debug_enabled) + : backend_shader(compiler, params, shader, prog_data, debug_enabled), + key(key), gs_compile(NULL), prog_data(prog_data), + live_analysis(this), regpressure_analysis(this), + performance_analysis(this), + needs_register_pressure(needs_register_pressure), + dispatch_width(dispatch_width), + max_polygons(0), + api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width)) +{ + init(); +} + +fs_visitor::fs_visitor(const struct brw_compiler *compiler, + const struct brw_compile_params *params, + const brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const nir_shader *shader, + unsigned dispatch_width, unsigned max_polygons, + bool needs_register_pressure, + bool debug_enabled) + : backend_shader(compiler, params, shader, &prog_data->base, + debug_enabled), + key(&key->base), gs_compile(NULL), prog_data(&prog_data->base), + live_analysis(this), regpressure_analysis(this), + performance_analysis(this), + needs_register_pressure(needs_register_pressure), + dispatch_width(dispatch_width), + max_polygons(max_polygons), + api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width)) +{ + init(); + assert(api_subgroup_size == 0 || + api_subgroup_size == 8 || + api_subgroup_size == 16 || + api_subgroup_size == 32); +} + +fs_visitor::fs_visitor(const struct brw_compiler *compiler, + const struct brw_compile_params *params, + struct brw_gs_compile *c, + struct brw_gs_prog_data *prog_data, + const nir_shader *shader, + bool needs_register_pressure, + bool debug_enabled) + : backend_shader(compiler, params, shader, &prog_data->base.base, + debug_enabled), + 
key(&c->key.base), gs_compile(c), + prog_data(&prog_data->base.base), + live_analysis(this), regpressure_analysis(this), + performance_analysis(this), + needs_register_pressure(needs_register_pressure), + dispatch_width(compiler->devinfo->ver >= 20 ? 16 : 8), + max_polygons(0), + api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width)) +{ + init(); + assert(api_subgroup_size == 0 || + api_subgroup_size == 8 || + api_subgroup_size == 16 || + api_subgroup_size == 32); +} + +void +fs_visitor::init() +{ + if (key) + this->key_tex = &key->tex; + else + this->key_tex = NULL; + + this->max_dispatch_width = 32; + this->prog_data = this->stage_prog_data; + + this->failed = false; + this->fail_msg = NULL; + + this->payload_ = NULL; + this->source_depth_to_render_target = false; + this->runtime_check_aads_emit = false; + this->first_non_payload_grf = 0; + this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF; + + this->uniforms = 0; + this->last_scratch = 0; + this->push_constant_loc = NULL; + + memset(&this->shader_stats, 0, sizeof(this->shader_stats)); + + this->grf_used = 0; + this->spilled_any_registers = false; +} + +fs_visitor::~fs_visitor() +{ + delete this->payload_; +} diff --git a/src/intel/compiler/elk/brw_gram.y b/src/intel/compiler/elk/brw_gram.y new file mode 100644 index 00000000000..a32b2bffb0c --- /dev/null +++ b/src/intel/compiler/elk/brw_gram.y @@ -0,0 +1,2566 @@ +%{ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice 
(including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include "brw_asm.h" + +#undef yyerror +#ifdef YYBYACC +struct YYLTYPE; +void yyerror (struct YYLTYPE *, char *); +#else +void yyerror (char *); +#endif + +#undef ALIGN16 + +#define YYLTYPE YYLTYPE +typedef struct YYLTYPE +{ + int first_line; + int first_column; + int last_line; + int last_column; +} YYLTYPE; + +enum message_level { + WARN, + ERROR, +}; + +int yydebug = 1; + +static void +message(enum message_level level, YYLTYPE *location, + const char *fmt, ...) +{ + static const char *level_str[] = { "warning", "error" }; + va_list args; + + if (location) + fprintf(stderr, "%s:%d:%d: %s: ", input_filename, + location->first_line, + location->first_column, level_str[level]); + else + fprintf(stderr, "%s:%s: ", input_filename, level_str[level]); + + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); +} + +#define warn(flag, l, fmt, ...) \ + do { \ + if (warning_flags & WARN_ ## flag) \ + message(WARN, l, fmt, ## __VA_ARGS__); \ + } while (0) + +#define error(l, fmt, ...) 
\ + do { \ + message(ERROR, l, fmt, ## __VA_ARGS__); \ + } while (0) + +static bool +isPowerofTwo(unsigned int x) +{ + return x && (!(x & (x - 1))); +} + +static struct brw_reg +set_direct_src_operand(struct brw_reg *reg, int type) +{ + return brw_reg(reg->file, + reg->nr, + reg->subnr, + 0, // negate + 0, // abs + type, + 0, // vstride + 0, // width + 0, // hstride + BRW_SWIZZLE_NOOP, + WRITEMASK_XYZW); +} + +static void +i965_asm_unary_instruction(int opcode, struct brw_codegen *p, + struct brw_reg dest, struct brw_reg src0) +{ + switch (opcode) { + case BRW_OPCODE_BFREV: + brw_BFREV(p, dest, src0); + break; + case BRW_OPCODE_CBIT: + brw_CBIT(p, dest, src0); + break; + case BRW_OPCODE_F32TO16: + brw_F32TO16(p, dest, src0); + break; + case BRW_OPCODE_F16TO32: + brw_F16TO32(p, dest, src0); + break; + case BRW_OPCODE_MOV: + brw_MOV(p, dest, src0); + break; + case BRW_OPCODE_FBL: + brw_FBL(p, dest, src0); + break; + case BRW_OPCODE_FRC: + brw_FRC(p, dest, src0); + break; + case BRW_OPCODE_FBH: + brw_FBH(p, dest, src0); + break; + case BRW_OPCODE_NOT: + brw_NOT(p, dest, src0); + break; + case BRW_OPCODE_RNDE: + brw_RNDE(p, dest, src0); + break; + case BRW_OPCODE_RNDZ: + brw_RNDZ(p, dest, src0); + break; + case BRW_OPCODE_RNDD: + brw_RNDD(p, dest, src0); + break; + case BRW_OPCODE_LZD: + brw_LZD(p, dest, src0); + break; + case BRW_OPCODE_DIM: + brw_DIM(p, dest, src0); + break; + case BRW_OPCODE_RNDU: + fprintf(stderr, "Opcode BRW_OPCODE_RNDU unhandled\n"); + break; + default: + fprintf(stderr, "Unsupported unary opcode\n"); + } +} + +static void +i965_asm_binary_instruction(int opcode, + struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1) +{ + switch (opcode) { + case BRW_OPCODE_ADDC: + brw_ADDC(p, dest, src0, src1); + break; + case BRW_OPCODE_BFI1: + brw_BFI1(p, dest, src0, src1); + break; + case BRW_OPCODE_DP2: + brw_DP2(p, dest, src0, src1); + break; + case BRW_OPCODE_DP3: + brw_DP3(p, dest, src0, src1); + break; + case 
BRW_OPCODE_DP4: + brw_DP4(p, dest, src0, src1); + break; + case BRW_OPCODE_DPH: + brw_DPH(p, dest, src0, src1); + break; + case BRW_OPCODE_LINE: + brw_LINE(p, dest, src0, src1); + break; + case BRW_OPCODE_MAC: + brw_MAC(p, dest, src0, src1); + break; + case BRW_OPCODE_MACH: + brw_MACH(p, dest, src0, src1); + break; + case BRW_OPCODE_PLN: + brw_PLN(p, dest, src0, src1); + break; + case BRW_OPCODE_ROL: + brw_ROL(p, dest, src0, src1); + break; + case BRW_OPCODE_ROR: + brw_ROR(p, dest, src0, src1); + break; + case BRW_OPCODE_SAD2: + fprintf(stderr, "Opcode BRW_OPCODE_SAD2 unhandled\n"); + break; + case BRW_OPCODE_SADA2: + fprintf(stderr, "Opcode BRW_OPCODE_SADA2 unhandled\n"); + break; + case BRW_OPCODE_SUBB: + brw_SUBB(p, dest, src0, src1); + break; + case BRW_OPCODE_ADD: + brw_ADD(p, dest, src0, src1); + break; + case BRW_OPCODE_CMP: + /* Third parameter is conditional modifier + * which gets updated later + */ + brw_CMP(p, dest, 0, src0, src1); + break; + case BRW_OPCODE_AND: + brw_AND(p, dest, src0, src1); + break; + case BRW_OPCODE_ASR: + brw_ASR(p, dest, src0, src1); + break; + case BRW_OPCODE_AVG: + brw_AVG(p, dest, src0, src1); + break; + case BRW_OPCODE_OR: + brw_OR(p, dest, src0, src1); + break; + case BRW_OPCODE_SEL: + brw_SEL(p, dest, src0, src1); + break; + case BRW_OPCODE_SHL: + brw_SHL(p, dest, src0, src1); + break; + case BRW_OPCODE_SHR: + brw_SHR(p, dest, src0, src1); + break; + case BRW_OPCODE_XOR: + brw_XOR(p, dest, src0, src1); + break; + case BRW_OPCODE_MUL: + brw_MUL(p, dest, src0, src1); + break; + default: + fprintf(stderr, "Unsupported binary opcode\n"); + } +} + +static void +i965_asm_ternary_instruction(int opcode, + struct brw_codegen *p, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1, + struct brw_reg src2) +{ + switch (opcode) { + case BRW_OPCODE_MAD: + brw_MAD(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_CSEL: + brw_CSEL(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_LRP: + brw_LRP(p, dest, src0, 
src1, src2); + break; + case BRW_OPCODE_BFE: + brw_BFE(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_BFI2: + brw_BFI2(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_DP4A: + brw_DP4A(p, dest, src0, src1, src2); + break; + case BRW_OPCODE_ADD3: + brw_ADD3(p, dest, src0, src1, src2); + break; + default: + fprintf(stderr, "Unsupported ternary opcode\n"); + } +} + +static void +i965_asm_set_instruction_options(struct brw_codegen *p, + struct options options) +{ + brw_inst_set_access_mode(p->devinfo, brw_last_inst, + options.access_mode); + brw_inst_set_mask_control(p->devinfo, brw_last_inst, + options.mask_control); + if (p->devinfo->ver < 12) { + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + options.thread_control); + brw_inst_set_no_dd_check(p->devinfo, brw_last_inst, + options.no_dd_check); + brw_inst_set_no_dd_clear(p->devinfo, brw_last_inst, + options.no_dd_clear); + } else { + brw_inst_set_swsb(p->devinfo, brw_last_inst, + tgl_swsb_encode(p->devinfo, options.depinfo)); + } + brw_inst_set_debug_control(p->devinfo, brw_last_inst, + options.debug_control); + if (p->devinfo->ver >= 6) + brw_inst_set_acc_wr_control(p->devinfo, brw_last_inst, + options.acc_wr_control); + brw_inst_set_cmpt_control(p->devinfo, brw_last_inst, + options.compaction); +} + +static void +i965_asm_set_dst_nr(struct brw_codegen *p, + struct brw_reg *reg, + struct options options) +{ + if (p->devinfo->ver <= 6) { + if (reg->file == BRW_MESSAGE_REGISTER_FILE && + options.qtr_ctrl == BRW_COMPRESSION_COMPRESSED && + !options.is_compr) + reg->nr |= BRW_MRF_COMPR4; + } +} + +static void +add_label(struct brw_codegen *p, const char* label_name, enum instr_label_type type) +{ + if (!label_name) { + return; + } + + struct instr_label *label = rzalloc(p->mem_ctx, struct instr_label); + + label->name = ralloc_strdup(p->mem_ctx, label_name); + label->offset = p->next_insn_offset; + label->type = type; + + list_addtail(&label->link, &instr_labels); +} + +%} + +%locations + +%start 
ROOT + +%union { + char *string; + double number; + int integer; + unsigned long long int llint; + struct brw_reg reg; + enum brw_reg_type reg_type; + struct brw_codegen *program; + struct predicate predicate; + struct condition condition; + struct options options; + struct instoption instoption; + struct msgdesc msgdesc; + struct tgl_swsb depinfo; + brw_inst *instruction; +} + +%token ABS +%token COLON +%token COMMA +%token DOT +%token LANGLE RANGLE +%token LCURLY RCURLY +%token LPAREN RPAREN +%token LSQUARE RSQUARE +%token PLUS MINUS +%token SEMICOLON +%token ASSIGN + +/* datatypes */ +%token TYPE_B TYPE_UB +%token TYPE_W TYPE_UW +%token TYPE_D TYPE_UD +%token TYPE_Q TYPE_UQ +%token TYPE_V TYPE_UV +%token TYPE_F TYPE_HF +%token TYPE_DF TYPE_NF +%token TYPE_VF + +/* label */ +%token JUMP_LABEL +%token JUMP_LABEL_TARGET + +/* opcodes */ +%token ADD ADD3 ADDC AND ASR AVG +%token BFE BFI1 BFI2 BFB BFREV BRC BRD BREAK +%token CALL CALLA CASE CBIT CMP CMPN CONT CSEL +%token DIM DO DPAS DPASW DP2 DP3 DP4 DP4A DPH +%token ELSE ENDIF F16TO32 F32TO16 FBH FBL FORK FRC +%token GOTO +%token HALT +%token IF IFF ILLEGAL +%token JMPI JOIN +%token LINE LRP LZD +%token MAC MACH MAD MADM MOV MOVI MUL MREST MSAVE +%token NENOP NOP NOT +%token OR +%token PLN POP PUSH +%token RET RNDD RNDE RNDU RNDZ ROL ROR +%token SAD2 SADA2 SEL SENDS SENDSC SHL SHR SMOV SUBB SYNC +%token SEND_GFX4 SENDC_GFX4 SEND_GFX12 SENDC_GFX12 +%token WAIT WHILE +%token XOR + +/* extended math functions */ +%token COS EXP FDIV INV INVM INTDIV INTDIVMOD INTMOD LOG POW RSQ +%token RSQRTM SIN SINCOS SQRT + +/* sync instruction */ +%token ALLRD ALLWR FENCE BAR HOST +%type sync_function +%type sync_arg + +/* shared functions for send */ +%token CONST CRE DATA DP_DATA_1 GATEWAY MATH PIXEL_INTERP READ RENDER SAMPLER +%token THREAD_SPAWNER URB VME WRITE DP_SAMPLER RT_ACCEL SLM TGM UGM + +/* message details for send */ +%token MSGDESC_BEGIN SRC1_LEN EX_BSO MSGDESC_END +%type msgdesc msgdesc_parts; + +/* Conditional 
modifiers */ +%token EQUAL GREATER GREATER_EQUAL LESS LESS_EQUAL NOT_EQUAL +%token NOT_ZERO OVERFLOW UNORDERED ZERO + +/* register Access Modes */ +%token ALIGN1 ALIGN16 + +/* accumulator write control */ +%token ACCWREN + +/* compaction control */ +%token CMPTCTRL + +/* compression control */ +%token COMPR COMPR4 SECHALF + +/* mask control (WeCtrl) */ +%token WECTRL + +/* debug control */ +%token BREAKPOINT + +/* dependency control */ +%token NODDCLR NODDCHK + +/* end of thread */ +%token EOT + +/* mask control */ +%token MASK_DISABLE; + +/* predicate control */ +%token ANYV ALLV ANY2H ALL2H ANY4H ALL4H ANY8H ALL8H ANY16H ALL16H +%token ANY32H ALL32H + +/* round instructions */ +%token ROUND_INCREMENT + +/* saturation */ +%token SATURATE + +/* thread control */ +%token ATOMIC SWITCH + +/* quarter control */ +%token QTR_2Q QTR_3Q QTR_4Q QTR_2H QTR_2N QTR_3N QTR_4N QTR_5N +%token QTR_6N QTR_7N QTR_8N + +/* channels */ +%token X Y Z W + +/* reg files */ +%token GENREGFILE MSGREGFILE + +/* vertical stride in register region */ +%token VxH + +/* register type */ +%token GENREG MSGREG ADDRREG ACCREG FLAGREG NOTIFYREG STATEREG +%token CONTROLREG IPREG PERFORMANCEREG THREADREG CHANNELENABLEREG +%token MASKREG + +%token INTEGER +%token LONG +%token NULL_TOKEN + +%nonassoc SUBREGNUM +%left PLUS MINUS +%nonassoc DOT +%nonassoc EMPTYEXECSIZE +%nonassoc LPAREN + +%type execsize simple_int exp +%type exp2 + +/* predicate control */ +%type predctrl predstate +%type predicate + +/* conditional modifier */ +%type cond_mod +%type condModifiers + +/* instruction options */ +%type instoptions instoption_list +%type instoption + +/* writemask */ +%type writemask_x writemask_y writemask_z writemask_w +%type writemask + +/* dst operand */ +%type dst dstoperand dstoperandex dstoperandex_typed dstreg +%type dstregion + +%type saturate relativelocation rellocation +%type relativelocation2 + +/* src operand */ +%type directsrcoperand directsrcaccoperand indirectsrcoperand srcacc +%type 
srcarcoperandex srcaccimm srcarcoperandex_typed srcimm +%type indirectgenreg indirectregion +%type immreg src reg32 payload directgenreg_list addrparam region +%type region_wh directgenreg directmsgreg indirectmsgreg +%type desc ex_desc reg32a +%type swizzle + +/* registers */ +%type accreg addrreg channelenablereg controlreg flagreg ipreg +%type notifyreg nullreg performancereg threadcontrolreg statereg maskreg +%type subregnum + +/* register types */ +%type reg_type imm_type + +/* immediate values */ +%type immval + +/* instruction opcodes */ +%type unaryopcodes binaryopcodes binaryaccopcodes ternaryopcodes +%type sendop sendsop +%type sendopcode sendsopcode + +%type negate abs chansel math_function sharedfunction + +%type jumplabeltarget +%type jumplabel + +/* SWSB */ +%token REG_DIST_CURRENT +%token REG_DIST_FLOAT +%token REG_DIST_INT +%token REG_DIST_LONG +%token REG_DIST_ALL +%token SBID_ALLOC +%token SBID_WAIT_SRC +%token SBID_WAIT_DST + +%type depinfo + +%code { + +static void +add_instruction_option(struct options *options, struct instoption opt) +{ + if (opt.type == INSTOPTION_DEP_INFO) { + if (opt.depinfo_value.regdist) { + options->depinfo.regdist = opt.depinfo_value.regdist; + options->depinfo.pipe = opt.depinfo_value.pipe; + } else { + options->depinfo.sbid = opt.depinfo_value.sbid; + options->depinfo.mode = opt.depinfo_value.mode; + } + return; + } + switch (opt.uint_value) { + case ALIGN1: + options->access_mode = BRW_ALIGN_1; + break; + case ALIGN16: + options->access_mode = BRW_ALIGN_16; + break; + case SECHALF: + options->qtr_ctrl |= BRW_COMPRESSION_2NDHALF; + break; + case COMPR: + options->qtr_ctrl |= BRW_COMPRESSION_COMPRESSED; + options->is_compr = true; + break; + case COMPR4: + options->qtr_ctrl |= BRW_COMPRESSION_COMPRESSED; + break; + case SWITCH: + options->thread_control |= BRW_THREAD_SWITCH; + break; + case ATOMIC: + options->thread_control |= BRW_THREAD_ATOMIC; + break; + case NODDCHK: + options->no_dd_check = true; + break; + case 
NODDCLR: + options->no_dd_clear = BRW_DEPENDENCY_NOTCLEARED; + break; + case MASK_DISABLE: + options->mask_control |= BRW_MASK_DISABLE; + break; + case BREAKPOINT: + options->debug_control = BRW_DEBUG_BREAKPOINT; + break; + case WECTRL: + options->mask_control |= BRW_WE_ALL; + break; + case CMPTCTRL: + options->compaction = true; + break; + case ACCWREN: + options->acc_wr_control = true; + break; + case EOT: + options->end_of_thread = true; + break; + /* TODO : Figure out how to set instruction group and get rid of + * code below + */ + case QTR_2Q: + options->qtr_ctrl = BRW_COMPRESSION_2NDHALF; + break; + case QTR_3Q: + options->qtr_ctrl = BRW_COMPRESSION_COMPRESSED; + break; + case QTR_4Q: + options->qtr_ctrl = 3; + break; + case QTR_2H: + options->qtr_ctrl = BRW_COMPRESSION_COMPRESSED; + break; + case QTR_2N: + options->qtr_ctrl = BRW_COMPRESSION_NONE; + options->nib_ctrl = true; + break; + case QTR_3N: + options->qtr_ctrl = BRW_COMPRESSION_2NDHALF; + break; + case QTR_4N: + options->qtr_ctrl = BRW_COMPRESSION_2NDHALF; + options->nib_ctrl = true; + break; + case QTR_5N: + options->qtr_ctrl = BRW_COMPRESSION_COMPRESSED; + break; + case QTR_6N: + options->qtr_ctrl = BRW_COMPRESSION_COMPRESSED; + options->nib_ctrl = true; + break; + case QTR_7N: + options->qtr_ctrl = 3; + break; + case QTR_8N: + options->qtr_ctrl = 3; + options->nib_ctrl = true; + break; + } +} +} +%% + +ROOT: + instrseq + ; + +instrseq: + instrseq instruction SEMICOLON + | instrseq relocatableinstruction SEMICOLON + | instruction SEMICOLON + | relocatableinstruction SEMICOLON + | instrseq jumplabeltarget + | jumplabeltarget + ; + +/* Instruction Group */ +instruction: + unaryinstruction + | binaryinstruction + | binaryaccinstruction + | mathinstruction + | nopinstruction + | waitinstruction + | ternaryinstruction + | sendinstruction + | illegalinstruction + | syncinstruction + ; + +relocatableinstruction: + jumpinstruction + | branchinstruction + | breakinstruction + | loopinstruction + ; + 
+illegalinstruction: + ILLEGAL execsize instoptions + { + brw_next_insn(p, $1); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + i965_asm_set_instruction_options(p, $3); + } + ; + +/* Unary instruction */ +unaryinstruction: + predicate unaryopcodes saturate cond_mod execsize dst srcaccimm instoptions + { + i965_asm_set_dst_nr(p, &$6, $8); + brw_set_default_access_mode(p, $8.access_mode); + i965_asm_unary_instruction($2, p, $6, $7); + brw_pop_insn_state(p); + i965_asm_set_instruction_options(p, $8); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, + $4.cond_modifier); + + if (p->devinfo->ver >= 7 && $2 != BRW_OPCODE_DIM && + !brw_inst_flag_reg_nr(p->devinfo, brw_last_inst)) { + brw_inst_set_flag_reg_nr(p->devinfo, + brw_last_inst, + $4.flag_reg_nr); + brw_inst_set_flag_subreg_nr(p->devinfo, + brw_last_inst, + $4.flag_subreg_nr); + } + + if ($7.file != BRW_IMMEDIATE_VALUE) { + brw_inst_set_src0_vstride(p->devinfo, brw_last_inst, + $7.vstride); + } + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $8.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $8.nib_ctrl); + } + ; + +unaryopcodes: + BFREV + | CBIT + | DIM + | F16TO32 + | F32TO16 + | FBH + | FBL + | FRC + | LZD + | MOV + | NOT + | RNDD + | RNDE + | RNDU + | RNDZ + ; + +/* Binary instruction */ +binaryinstruction: + predicate binaryopcodes saturate cond_mod execsize dst srcimm srcimm instoptions + { + i965_asm_set_dst_nr(p, &$6, $9); + brw_set_default_access_mode(p, $9.access_mode); + i965_asm_binary_instruction($2, p, $6, $7, $8); + i965_asm_set_instruction_options(p, $9); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, + $4.cond_modifier); + + if (p->devinfo->ver >= 7 && + !brw_inst_flag_reg_nr(p->devinfo, brw_last_inst)) { + brw_inst_set_flag_reg_nr(p->devinfo, 
brw_last_inst, + $4.flag_reg_nr); + brw_inst_set_flag_subreg_nr(p->devinfo, brw_last_inst, + $4.flag_subreg_nr); + } + + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $9.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $9.nib_ctrl); + + brw_pop_insn_state(p); + } + ; + +binaryopcodes: + ADDC + | BFI1 + | DP2 + | DP3 + | DP4 + | DPH + | LINE + | MAC + | MACH + | MUL + | PLN + | ROL + | ROR + | SAD2 + | SADA2 + | SUBB + ; + +/* Binary acc instruction */ +binaryaccinstruction: + predicate binaryaccopcodes saturate cond_mod execsize dst srcacc srcimm instoptions + { + i965_asm_set_dst_nr(p, &$6, $9); + brw_set_default_access_mode(p, $9.access_mode); + i965_asm_binary_instruction($2, p, $6, $7, $8); + brw_pop_insn_state(p); + i965_asm_set_instruction_options(p, $9); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, + $4.cond_modifier); + + if (p->devinfo->ver >= 7 && + !brw_inst_flag_reg_nr(p->devinfo, brw_last_inst)) { + brw_inst_set_flag_reg_nr(p->devinfo, + brw_last_inst, + $4.flag_reg_nr); + brw_inst_set_flag_subreg_nr(p->devinfo, + brw_last_inst, + $4.flag_subreg_nr); + } + + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $9.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $9.nib_ctrl); + } + ; + +binaryaccopcodes: + ADD + | AND + | ASR + | AVG + | CMP + | CMPN + | OR + | SEL + | SHL + | SHR + | XOR + ; + +/* Math instruction */ +mathinstruction: + predicate MATH saturate math_function execsize dst src srcimm instoptions + { + brw_set_default_access_mode(p, $9.access_mode); + gfx6_math(p, $6, $4, $7, $8); + 
i965_asm_set_instruction_options(p, $9); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + // TODO: set instruction group instead + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $9.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $9.nib_ctrl); + + brw_pop_insn_state(p); + } + ; + +math_function: + COS + | EXP + | FDIV + | INV + | INVM + | INTDIV + | INTDIVMOD + | INTMOD + | LOG + | POW + | RSQ + | RSQRTM + | SIN + | SQRT + | SINCOS + ; + +/* NOP instruction */ +nopinstruction: + NOP + { + brw_NOP(p); + } + ; + +/* Ternary operand instruction */ +ternaryinstruction: + predicate ternaryopcodes saturate cond_mod execsize dst srcimm src srcimm instoptions + { + brw_set_default_access_mode(p, $10.access_mode); + i965_asm_ternary_instruction($2, p, $6, $7, $8, $9); + brw_pop_insn_state(p); + i965_asm_set_instruction_options(p, $10); + brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, + $4.cond_modifier); + + if (p->devinfo->ver >= 7 && p->devinfo->ver < 12) { + brw_inst_set_3src_a16_flag_reg_nr(p->devinfo, brw_last_inst, + $4.flag_reg_nr); + brw_inst_set_3src_a16_flag_subreg_nr(p->devinfo, brw_last_inst, + $4.flag_subreg_nr); + } + + brw_inst_set_saturate(p->devinfo, brw_last_inst, $3); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $5); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $10.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $10.nib_ctrl); + } + ; + +ternaryopcodes: + CSEL + | BFE + | BFI2 + | LRP + | MAD + | DP4A + | ADD3 + ; + +/* Wait instruction */ +waitinstruction: + WAIT execsize dst instoptions + { + brw_next_insn(p, $1); + i965_asm_set_instruction_options(p, $4); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + brw_set_default_access_mode(p, $4.access_mode); + struct brw_reg dest = $3; + 
dest.swizzle = brw_swizzle_for_mask(dest.writemask); + if (dest.file != ARF || dest.nr != BRW_ARF_NOTIFICATION_COUNT) + error(&@1, "WAIT must use the notification register\n"); + brw_set_dest(p, brw_last_inst, dest); + brw_set_src0(p, brw_last_inst, dest); + brw_set_src1(p, brw_last_inst, brw_null_reg()); + brw_inst_set_mask_control(p->devinfo, brw_last_inst, BRW_MASK_DISABLE); + } + ; + +/* Send instruction */ +sendinstruction: + predicate sendopcode execsize dst payload exp2 sharedfunction msgdesc instoptions + { + i965_asm_set_instruction_options(p, $9); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, $4); + brw_set_src0(p, brw_last_inst, $5); + brw_inst_set_bits(brw_last_inst, 127, 96, $6); + brw_inst_set_src1_file_type(p->devinfo, brw_last_inst, + BRW_IMMEDIATE_VALUE, + BRW_REGISTER_TYPE_UD); + brw_inst_set_sfid(p->devinfo, brw_last_inst, $7); + brw_inst_set_eot(p->devinfo, brw_last_inst, $9.end_of_thread); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $9.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $9.nib_ctrl); + + brw_pop_insn_state(p); + } + | predicate sendopcode execsize exp dst payload exp2 sharedfunction msgdesc instoptions + { + assert(p->devinfo->ver < 6); + + i965_asm_set_instruction_options(p, $10); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_inst_set_base_mrf(p->devinfo, brw_last_inst, $4); + brw_set_dest(p, brw_last_inst, $5); + brw_set_src0(p, brw_last_inst, $6); + brw_inst_set_bits(brw_last_inst, 127, 96, $7); + brw_inst_set_src1_file_type(p->devinfo, brw_last_inst, + BRW_IMMEDIATE_VALUE, + BRW_REGISTER_TYPE_UD); + brw_inst_set_sfid(p->devinfo, brw_last_inst, $8); + brw_inst_set_eot(p->devinfo, brw_last_inst, $10.end_of_thread); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $10.qtr_ctrl); + + 
brw_pop_insn_state(p); + } + | predicate sendopcode execsize dst payload payload exp2 sharedfunction msgdesc instoptions + { + assert(p->devinfo->ver >= 6 && p->devinfo->ver < 12); + + i965_asm_set_instruction_options(p, $10); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, $4); + brw_set_src0(p, brw_last_inst, $5); + brw_inst_set_bits(brw_last_inst, 127, 96, $7); + brw_inst_set_sfid(p->devinfo, brw_last_inst, $8); + brw_inst_set_eot(p->devinfo, brw_last_inst, $10.end_of_thread); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $10.qtr_ctrl); + + if (p->devinfo->ver >= 7) + brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $10.nib_ctrl); + + brw_pop_insn_state(p); + } + | predicate sendsopcode execsize dst payload payload desc ex_desc sharedfunction msgdesc instoptions + { + assert(p->devinfo->ver >= 9); + + i965_asm_set_instruction_options(p, $11); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, $4); + brw_set_src0(p, brw_last_inst, $5); + brw_set_src1(p, brw_last_inst, $6); + + if ($7.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_send_sel_reg32_desc(p->devinfo, brw_last_inst, 0); + brw_inst_set_send_desc(p->devinfo, brw_last_inst, $7.ud); + } else { + brw_inst_set_send_sel_reg32_desc(p->devinfo, brw_last_inst, 1); + } + + if ($8.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_send_sel_reg32_ex_desc(p->devinfo, brw_last_inst, 0); + brw_inst_set_sends_ex_desc(p->devinfo, brw_last_inst, $8.ud); + } else { + brw_inst_set_send_sel_reg32_ex_desc(p->devinfo, brw_last_inst, 1); + brw_inst_set_send_ex_desc_ia_subreg_nr(p->devinfo, brw_last_inst, $8.subnr >> 2); + } + + brw_inst_set_sfid(p->devinfo, brw_last_inst, $9); + brw_inst_set_eot(p->devinfo, brw_last_inst, $11.end_of_thread); + // TODO: set instruction group instead of qtr and nib ctrl + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, + $11.qtr_ctrl); + + 
brw_inst_set_nib_control(p->devinfo, brw_last_inst, + $11.nib_ctrl); + + if (p->devinfo->verx10 >= 125 && $10.ex_bso) { + brw_inst_set_send_ex_bso(p->devinfo, brw_last_inst, 1); + brw_inst_set_send_src1_len(p->devinfo, brw_last_inst, + $10.src1_len); + } + + brw_pop_insn_state(p); + } + ; + +sendop: + SEND_GFX4 + | SENDC_GFX4 + ; + +sendsop: + SEND_GFX12 + | SENDC_GFX12 + | SENDS + | SENDSC + ; + +sendopcode: + sendop { $$ = brw_next_insn(p, $1); } + ; + +sendsopcode: + sendsop { $$ = brw_next_insn(p, $1); } + ; + +sharedfunction: + NULL_TOKEN { $$ = BRW_SFID_NULL; } + | MATH { $$ = BRW_SFID_MATH; } + | GATEWAY { $$ = BRW_SFID_MESSAGE_GATEWAY; } + | READ { $$ = BRW_SFID_DATAPORT_READ; } + | WRITE { $$ = BRW_SFID_DATAPORT_WRITE; } + | URB { $$ = BRW_SFID_URB; } + | THREAD_SPAWNER { $$ = BRW_SFID_THREAD_SPAWNER; } + | VME { $$ = BRW_SFID_VME; } + | RENDER { $$ = GFX6_SFID_DATAPORT_RENDER_CACHE; } + | CONST { $$ = GFX6_SFID_DATAPORT_CONSTANT_CACHE; } + | DATA { $$ = GFX7_SFID_DATAPORT_DATA_CACHE; } + | PIXEL_INTERP { $$ = GFX7_SFID_PIXEL_INTERPOLATOR; } + | DP_DATA_1 { $$ = HSW_SFID_DATAPORT_DATA_CACHE_1; } + | CRE { $$ = HSW_SFID_CRE; } + | SAMPLER { $$ = BRW_SFID_SAMPLER; } + | DP_SAMPLER { $$ = GFX6_SFID_DATAPORT_SAMPLER_CACHE; } + | RT_ACCEL { $$ = GEN_RT_SFID_RAY_TRACE_ACCELERATOR; } + | SLM { $$ = GFX12_SFID_SLM; } + | TGM { $$ = GFX12_SFID_TGM; } + | UGM { $$ = GFX12_SFID_UGM; } + ; + +exp2: + LONG { $$ = $1; } + | MINUS LONG { $$ = -$2; } + ; + +desc: + reg32a + | exp2 + { + $$ = brw_imm_ud($1); + } + ; + +ex_desc: + reg32a + | exp2 + { + $$ = brw_imm_ud($1); + } + ; + +reg32a: + addrreg region reg_type + { + $$ = set_direct_src_operand(&$1, $3); + $$ = stride($$, $2.vstride, $2.width, $2.hstride); + } + ; + + +/* Jump instruction */ +jumpinstruction: + predicate JMPI execsize relativelocation2 instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, 
brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, $4); + brw_inst_set_pred_control(p->devinfo, brw_last_inst, + brw_inst_pred_control(p->devinfo, + brw_last_inst)); + brw_pop_insn_state(p); + } + ; + +/* branch instruction */ +branchinstruction: + predicate ENDIF execsize JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver == 6) { + brw_set_dest(p, brw_last_inst, brw_imm_w(0x0)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, brw_imm_w(0x0)); + } else { + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate ENDIF execsize relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $4); + + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + BRW_THREAD_SWITCH); + + brw_pop_insn_state(p); + } + | ELSE execsize JUMP_LABEL jumplabel instoptions + { + add_label(p, $3, INSTR_LABEL_JIP); + add_label(p, $4, INSTR_LABEL_UIP); + + brw_next_insn(p, $1); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + + if (p->devinfo->ver == 6) { + brw_set_dest(p, 
brw_last_inst, brw_imm_w(0x0)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, brw_imm_w(0)); + } else { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + if (p->devinfo->ver < 12) + brw_set_src0(p, brw_last_inst, brw_imm_d(0)); + } + } + | ELSE execsize relativelocation rellocation instoptions + { + brw_next_insn(p, $1); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $3); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $4); + + if (!p->single_program_flow) + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + BRW_THREAD_SWITCH); + } + | predicate IF execsize JUMP_LABEL jumplabel instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + add_label(p, $5, INSTR_LABEL_UIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver == 6) { + brw_set_dest(p, brw_last_inst, brw_imm_w(0x0)); + brw_set_src0(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src1(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src0(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src1(p, brw_last_inst, brw_imm_w(0x0)); + } else { + 
brw_set_dest(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + if (p->devinfo->ver < 12) + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate IF execsize relativelocation rellocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $5); + + if (!p->single_program_flow) + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + BRW_THREAD_SWITCH); + + brw_pop_insn_state(p); + } + | predicate IFF execsize JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver == 6) { + brw_set_src0(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src1(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src0(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + brw_set_src1(p, brw_last_inst, brw_imm_w(0x0)); + } else { + brw_set_dest(p, brw_last_inst, + vec1(retype(brw_null_reg(), + BRW_REGISTER_TYPE_D))); + if (p->devinfo->ver < 12) + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate IFF execsize relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + 
brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_set_src1(p, brw_last_inst, brw_imm_d($4)); + + if (!p->single_program_flow) + brw_inst_set_thread_control(p->devinfo, brw_last_inst, + BRW_THREAD_SWITCH); + + brw_pop_insn_state(p); + } + ; + +/* break instruction */ +breakinstruction: + predicate BREAK execsize JUMP_LABEL JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + add_label(p, $5, INSTR_LABEL_UIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver >= 8) { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } else { + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate BREAK execsize relativelocation relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $5); + + brw_pop_insn_state(p); + } + | predicate HALT execsize JUMP_LABEL JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + add_label(p, $5, INSTR_LABEL_UIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + + if (p->devinfo->ver < 8) { + brw_set_src0(p, brw_last_inst, retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, 
brw_imm_d(0x0)); + } else if (p->devinfo->ver < 12) { + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate CONT execsize JUMP_LABEL JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + add_label(p, $5, INSTR_LABEL_UIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + + if (p->devinfo->ver >= 8) { + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } else { + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + } + + brw_pop_insn_state(p); + } + | predicate CONT execsize relativelocation relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, $5); + + brw_pop_insn_state(p); + } + ; + +/* loop instruction */ +loopinstruction: + predicate WHILE execsize JUMP_LABEL instoptions + { + add_label(p, $4, INSTR_LABEL_JIP); + + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + if (p->devinfo->ver >= 8) { + brw_set_dest(p, brw_last_inst, + retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + if (p->devinfo->ver < 12) + brw_set_src0(p, brw_last_inst, brw_imm_d(0x0)); + } else if (p->devinfo->ver == 7) { + brw_set_dest(p, brw_last_inst, + retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src0(p, brw_last_inst, + retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, + brw_imm_w(0x0)); + } else { + brw_set_dest(p, brw_last_inst, brw_imm_w(0x0)); + brw_set_src0(p, brw_last_inst, + 
retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + brw_set_src1(p, brw_last_inst, + retype(brw_null_reg(), + BRW_REGISTER_TYPE_D)); + } + + brw_pop_insn_state(p); + } + | predicate WHILE execsize relativelocation instoptions + { + brw_next_insn(p, $2); + i965_asm_set_instruction_options(p, $5); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $3); + + brw_set_dest(p, brw_last_inst, brw_ip_reg()); + brw_set_src0(p, brw_last_inst, brw_ip_reg()); + brw_set_src1(p, brw_last_inst, brw_imm_d(0x0)); + brw_inst_set_gfx4_jump_count(p->devinfo, brw_last_inst, $4); + brw_inst_set_gfx4_pop_count(p->devinfo, brw_last_inst, 0); + + brw_pop_insn_state(p); + } + | DO execsize instoptions + { + brw_next_insn(p, $1); + if (p->devinfo->ver < 6) { + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $2); + i965_asm_set_instruction_options(p, $3); + brw_set_dest(p, brw_last_inst, brw_null_reg()); + brw_set_src0(p, brw_last_inst, brw_null_reg()); + brw_set_src1(p, brw_last_inst, brw_null_reg()); + + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, BRW_COMPRESSION_NONE); + } + } + ; + +/* sync instruction */ +syncinstruction: + predicate SYNC sync_function execsize sync_arg instoptions + { + if (p->devinfo->ver < 12) { + error(&@2, "sync instruction is supported only on gfx12+\n"); + } + + if ($5.file == BRW_IMMEDIATE_VALUE && + $3 != TGL_SYNC_ALLRD && + $3 != TGL_SYNC_ALLWR) { + error(&@2, "Only allrd and allwr support immediate argument\n"); + } + + brw_set_default_access_mode(p, $6.access_mode); + brw_SYNC(p, $3); + i965_asm_set_instruction_options(p, $6); + brw_inst_set_exec_size(p->devinfo, brw_last_inst, $4); + brw_set_src0(p, brw_last_inst, $5); + brw_inst_set_eot(p->devinfo, brw_last_inst, $6.end_of_thread); + brw_inst_set_qtr_control(p->devinfo, brw_last_inst, $6.qtr_ctrl); + brw_inst_set_nib_control(p->devinfo, brw_last_inst, $6.nib_ctrl); + + brw_pop_insn_state(p); + } + ; + +sync_function: + NOP { $$ = TGL_SYNC_NOP; } + | ALLRD + | ALLWR + | FENCE + | BAR + | HOST + ; + 
+sync_arg: + nullreg region reg_type + { + $$ = $1; + $$.vstride = $2.vstride; + $$.width = $2.width; + $$.hstride = $2.hstride; + $$.type = $3; + } + | immreg + ; + +/* Relative location */ +relativelocation2: + immreg + | reg32 + ; + +simple_int: + INTEGER { $$ = $1; } + | MINUS INTEGER { $$ = -$2; } + | LONG { $$ = $1; } + | MINUS LONG { $$ = -$2; } + ; + +rellocation: + relativelocation + | /* empty */ { $$ = 0; } + ; + +relativelocation: + simple_int + { + $$ = $1; + } + ; + +jumplabel: + JUMP_LABEL { $$ = $1; } + | /* empty */ { $$ = NULL; } + ; + +jumplabeltarget: + JUMP_LABEL_TARGET + { + struct target_label *label = rzalloc(p->mem_ctx, struct target_label); + + label->name = ralloc_strdup(p->mem_ctx, $1); + label->offset = p->next_insn_offset; + + list_addtail(&label->link, &target_labels); + } + ; + +/* Destination register */ +dst: + dstoperand + | dstoperandex + ; + +dstoperand: + dstreg dstregion writemask reg_type + { + $$ = $1; + $$.vstride = BRW_VERTICAL_STRIDE_1; + $$.width = BRW_WIDTH_1; + $$.hstride = $2; + $$.type = $4; + $$.writemask = $3; + $$.swizzle = BRW_SWIZZLE_NOOP; + $$.subnr = $$.subnr * brw_reg_type_to_size($4); + } + ; + +dstoperandex: + dstoperandex_typed dstregion writemask reg_type + { + $$ = $1; + $$.hstride = $2; + $$.type = $4; + $$.writemask = $3; + $$.subnr = $$.subnr * brw_reg_type_to_size($4); + } + /* BSpec says "When the conditional modifier is present, updates + * to the selected flag register also occur. In this case, the + * register region fields of the ‘null’ operand are valid." 
+ */ + | nullreg dstregion writemask reg_type + { + $$ = $1; + $$.vstride = BRW_VERTICAL_STRIDE_1; + $$.width = BRW_WIDTH_1; + $$.hstride = $2; + $$.writemask = $3; + $$.type = $4; + } + | threadcontrolreg + { + $$ = $1; + $$.hstride = 1; + $$.type = BRW_REGISTER_TYPE_UW; + } + ; + +dstoperandex_typed: + accreg + | addrreg + | channelenablereg + | controlreg + | flagreg + | ipreg + | maskreg + | notifyreg + | performancereg + | statereg + ; + +dstreg: + directgenreg + { + $$ = $1; + $$.address_mode = BRW_ADDRESS_DIRECT; + } + | indirectgenreg + { + $$ = $1; + $$.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + } + | directmsgreg + { + $$ = $1; + $$.address_mode = BRW_ADDRESS_DIRECT; + } + | indirectmsgreg + { + $$ = $1; + $$.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + } + ; + +/* Source register */ +srcaccimm: + srcacc + | immreg + ; + +immreg: + immval imm_type + { + switch ($2) { + case BRW_REGISTER_TYPE_UD: + $$ = brw_imm_ud($1); + break; + case BRW_REGISTER_TYPE_D: + $$ = brw_imm_d($1); + break; + case BRW_REGISTER_TYPE_UW: + $$ = brw_imm_uw($1 | ($1 << 16)); + break; + case BRW_REGISTER_TYPE_W: + $$ = brw_imm_w($1); + break; + case BRW_REGISTER_TYPE_F: + $$ = brw_imm_reg(BRW_REGISTER_TYPE_F); + /* Set u64 instead of ud since DIM uses a 64-bit F-typed imm */ + $$.u64 = $1; + break; + case BRW_REGISTER_TYPE_V: + $$ = brw_imm_v($1); + break; + case BRW_REGISTER_TYPE_UV: + $$ = brw_imm_uv($1); + break; + case BRW_REGISTER_TYPE_VF: + $$ = brw_imm_vf($1); + break; + case BRW_REGISTER_TYPE_Q: + $$ = brw_imm_q($1); + break; + case BRW_REGISTER_TYPE_UQ: + $$ = brw_imm_uq($1); + break; + case BRW_REGISTER_TYPE_DF: + $$ = brw_imm_reg(BRW_REGISTER_TYPE_DF); + $$.d64 = $1; + break; + case BRW_REGISTER_TYPE_HF: + $$ = brw_imm_reg(BRW_REGISTER_TYPE_HF); + $$.ud = $1 | ($1 << 16); + break; + default: + error(&@2, "Unknown immediate type %s\n", + brw_reg_type_to_letters($2)); + } + } + ; + +reg32: + directgenreg region reg_type + { + $$ = 
set_direct_src_operand(&$1, $3); + $$ = stride($$, $2.vstride, $2.width, $2.hstride); + } + ; + +payload: + directsrcoperand + ; + +src: + directsrcoperand + | indirectsrcoperand + ; + +srcacc: + directsrcaccoperand + | indirectsrcoperand + ; + +srcimm: + directsrcoperand + | indirectsrcoperand + | immreg + ; + +directsrcaccoperand: + directsrcoperand + | negate abs accreg region reg_type + { + $$ = set_direct_src_operand(&$3, $5); + $$.negate = $1; + $$.abs = $2; + $$.vstride = $4.vstride; + $$.width = $4.width; + $$.hstride = $4.hstride; + } + ; + +srcarcoperandex: + srcarcoperandex_typed region reg_type + { + $$ = brw_reg($1.file, + $1.nr, + $1.subnr, + 0, + 0, + $3, + $2.vstride, + $2.width, + $2.hstride, + BRW_SWIZZLE_NOOP, + WRITEMASK_XYZW); + } + | nullreg region reg_type + { + $$ = set_direct_src_operand(&$1, $3); + $$.vstride = $2.vstride; + $$.width = $2.width; + $$.hstride = $2.hstride; + } + | threadcontrolreg + { + $$ = set_direct_src_operand(&$1, BRW_REGISTER_TYPE_UW); + } + ; + +srcarcoperandex_typed: + channelenablereg + | controlreg + | flagreg + | ipreg + | maskreg + | statereg + ; + +indirectsrcoperand: + negate abs indirectgenreg indirectregion swizzle reg_type + { + $$ = brw_reg($3.file, + 0, + $3.subnr, + $1, // negate + $2, // abs + $6, + $4.vstride, + $4.width, + $4.hstride, + $5, + WRITEMASK_X); + + $$.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + // brw_reg set indirect_offset to 0 so set it to valid value + $$.indirect_offset = $3.indirect_offset; + } + ; + +directgenreg_list: + directgenreg + | directmsgreg + | notifyreg + | addrreg + | performancereg + ; + +directsrcoperand: + negate abs directgenreg_list region swizzle reg_type + { + $$ = brw_reg($3.file, + $3.nr, + $3.subnr, + $1, + $2, + $6, + $4.vstride, + $4.width, + $4.hstride, + $5, + WRITEMASK_X); + } + | srcarcoperandex + ; + +/* Address register */ +addrparam: + addrreg exp + { + memset(&$$, '\0', sizeof($$)); + $$.subnr = $1.subnr; + $$.indirect_offset = $2; + } + 
| addrreg
+	;
+
+/* Register files and register numbers */
+exp:
+	INTEGER { $$ = $1; }
+	| LONG { $$ = $1; }
+	;
+
+subregnum:
+	DOT exp { $$ = $2; }
+	| /* empty */ %prec SUBREGNUM { $$ = 0; }
+	;
+
+directgenreg:
+	GENREG subregnum
+	{
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_GENERAL_REGISTER_FILE;
+		$$.nr = $1;
+		$$.subnr = $2;
+	}
+	;
+
+indirectgenreg:
+	GENREGFILE LSQUARE addrparam RSQUARE
+	{
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_GENERAL_REGISTER_FILE;
+		$$.subnr = $3.subnr;
+		$$.indirect_offset = $3.indirect_offset;
+	}
+	;
+
+directmsgreg:
+	MSGREG subregnum
+	{
+		/* Zero-initialize like the sibling register rules so fields this
+		 * action does not set (e.g. indirect_offset) are not left as
+		 * stack garbage.
+		 */
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_MESSAGE_REGISTER_FILE;
+		$$.nr = $1;
+		$$.subnr = $2;
+	}
+	;
+
+indirectmsgreg:
+	MSGREGFILE LSQUARE addrparam RSQUARE
+	{
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_MESSAGE_REGISTER_FILE;
+		$$.subnr = $3.subnr;
+		$$.indirect_offset = $3.indirect_offset;
+	}
+	;
+
+addrreg:
+	ADDRREG subregnum
+	{
+		int subnr = (p->devinfo->ver >= 8) ? 16 : 8;
+
+		/* Adjacent string literals concatenate: keep the leading space
+		 * in the second literal so the message reads "number %d out of
+		 * range", not "number %dout of range".
+		 */
+		if ($2 > subnr)
+			error(&@2, "Address sub register number %d"
+				   " out of range\n", $2);
+
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_ARCHITECTURE_REGISTER_FILE;
+		$$.nr = BRW_ARF_ADDRESS;
+		$$.subnr = $2;
+	}
+	;
+
+accreg:
+	ACCREG subregnum
+	{
+		/* Gfx8+ exposes more accumulator registers than older parts. */
+		int nr_reg;
+		if (p->devinfo->ver < 8)
+			nr_reg = 2;
+		else
+			nr_reg = 10;
+
+		if ($1 > nr_reg)
+			error(&@1, "Accumulator register number %d"
+				   " out of range\n", $1);
+
+		memset(&$$, '\0', sizeof($$));
+		$$.file = BRW_ARCHITECTURE_REGISTER_FILE;
+		$$.nr = BRW_ARF_ACCUMULATOR;
+		$$.subnr = $2;
+	}
+	;
+
+flagreg:
+	FLAGREG subregnum
+	{
+		// SNB = 1 flag reg and IVB+ = 2 flag reg
+		int nr_reg = (p->devinfo->ver >= 7) ?
2 : 1; + int subnr = nr_reg; + + if ($1 > nr_reg) + error(&@1, "Flag register number %d" + " out of range \n", $1); + if ($2 > subnr) + error(&@2, "Flag subregister number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_FLAG | $1; + $$.subnr = $2; + } + ; + +maskreg: + MASKREG subregnum + { + if ($1 > 0) + error(&@1, "Mask register number %d" + " out of range\n", $1); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_MASK; + $$.subnr = $2; + } + ; + +notifyreg: + NOTIFYREG subregnum + { + int subnr = (p->devinfo->ver >= 11) ? 2 : 3; + if ($2 > subnr) + error(&@2, "Notification sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_NOTIFICATION_COUNT; + $$.subnr = $2; + } + ; + +statereg: + STATEREG subregnum + { + if ($1 > 2) + error(&@1, "State register number %d" + " out of range\n", $1); + + if ($2 > 4) + error(&@2, "State sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_STATE; + $$.subnr = $2; + } + ; + +controlreg: + CONTROLREG subregnum + { + if ($2 > 3) + error(&@2, "control sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_CONTROL; + $$.subnr = $2; + } + ; + +ipreg: + IPREG { $$ = brw_ip_reg(); } + ; + +nullreg: + NULL_TOKEN { $$ = brw_null_reg(); } + ; + +threadcontrolreg: + THREADREG subregnum + { + if ($2 > 7) + error(&@2, "Thread control sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_TDR; + $$.subnr = $2; + } + ; + +performancereg: + PERFORMANCEREG subregnum + { + int subnr; + if (p->devinfo->ver >= 10) + subnr = 5; + else if (p->devinfo->ver <= 8) + subnr = 3; + else + subnr = 4; + + if ($2 > subnr) + error(&@2, "Performance sub register number %d" + " out of range\n", $2); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = 
BRW_ARF_TIMESTAMP; + $$.subnr = $2; + } + ; + +channelenablereg: + CHANNELENABLEREG subregnum + { + if ($1 > 0) + error(&@1, "Channel enable register number %d" + " out of range\n", $1); + + $$.file = BRW_ARCHITECTURE_REGISTER_FILE; + $$.nr = BRW_ARF_MASK; + $$.subnr = $2; + } + ; + +/* Immediate values */ +immval: + exp2 + { + $$ = $1; + } + | LSQUARE exp2 COMMA exp2 COMMA exp2 COMMA exp2 RSQUARE + { + $$ = ($2 << 0) | ($4 << 8) | ($6 << 16) | ($8 << 24); + } + ; + +/* Regions */ +dstregion: + /* empty */ + { + $$ = BRW_HORIZONTAL_STRIDE_1; + } + | LANGLE exp RANGLE + { + if ($2 != 0 && ($2 > 4 || !isPowerofTwo($2))) + error(&@2, "Invalid Horizontal stride %d\n", $2); + + $$ = ffs($2); + } + ; + +indirectregion: + region + | region_wh + ; + +region: + /* empty */ + { + $$ = stride($$, 0, 1, 0); + } + | LANGLE exp RANGLE + { + if ($2 != 0 && ($2 > 32 || !isPowerofTwo($2))) + error(&@2, "Invalid VertStride %d\n", $2); + + $$ = stride($$, $2, 1, 0); + } + | LANGLE exp COMMA exp COMMA exp RANGLE + { + + if ($2 != 0 && ($2 > 32 || !isPowerofTwo($2))) + error(&@2, "Invalid VertStride %d\n", $2); + + if ($4 > 16 || !isPowerofTwo($4)) + error(&@4, "Invalid width %d\n", $4); + + if ($6 != 0 && ($6 > 4 || !isPowerofTwo($6))) + error(&@6, "Invalid Horizontal stride in" + " region_wh %d\n", $6); + + $$ = stride($$, $2, $4, $6); + } + | LANGLE exp SEMICOLON exp COMMA exp RANGLE + { + if ($2 != 0 && ($2 > 32 || !isPowerofTwo($2))) + error(&@2, "Invalid VertStride %d\n", $2); + + if ($4 > 16 || !isPowerofTwo($4)) + error(&@4, "Invalid width %d\n", $4); + + if ($6 != 0 && ($6 > 4 || !isPowerofTwo($6))) + error(&@6, "Invalid Horizontal stride in" + " region_wh %d\n", $6); + + $$ = stride($$, $2, $4, $6); + } + | LANGLE VxH COMMA exp COMMA exp RANGLE + { + if ($4 > 16 || !isPowerofTwo($4)) + error(&@4, "Invalid width %d\n", $4); + + if ($6 != 0 && ($6 > 4 || !isPowerofTwo($6))) + error(&@6, "Invalid Horizontal stride in" + " region_wh %d\n", $6); + + $$ = brw_VxH_indirect(0, 0); + 
} + ; + +region_wh: + LANGLE exp COMMA exp RANGLE + { + if ($2 > 16 || !isPowerofTwo($2)) + error(&@2, "Invalid width %d\n", $2); + + if ($4 != 0 && ($4 > 4 || !isPowerofTwo($4))) + error(&@4, "Invalid Horizontal stride in" + " region_wh %d\n", $4); + + $$ = stride($$, 0, $2, $4); + $$.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL; + } + ; + +reg_type: + TYPE_F { $$ = BRW_REGISTER_TYPE_F; } + | TYPE_UD { $$ = BRW_REGISTER_TYPE_UD; } + | TYPE_D { $$ = BRW_REGISTER_TYPE_D; } + | TYPE_UW { $$ = BRW_REGISTER_TYPE_UW; } + | TYPE_W { $$ = BRW_REGISTER_TYPE_W; } + | TYPE_UB { $$ = BRW_REGISTER_TYPE_UB; } + | TYPE_B { $$ = BRW_REGISTER_TYPE_B; } + | TYPE_DF { $$ = BRW_REGISTER_TYPE_DF; } + | TYPE_UQ { $$ = BRW_REGISTER_TYPE_UQ; } + | TYPE_Q { $$ = BRW_REGISTER_TYPE_Q; } + | TYPE_HF { $$ = BRW_REGISTER_TYPE_HF; } + | TYPE_NF { $$ = BRW_REGISTER_TYPE_NF; } + ; + +imm_type: + reg_type { $$ = $1; } + | TYPE_V { $$ = BRW_REGISTER_TYPE_V; } + | TYPE_VF { $$ = BRW_REGISTER_TYPE_VF; } + | TYPE_UV { $$ = BRW_REGISTER_TYPE_UV; } + ; + +writemask: + /* empty */ + { + $$ = WRITEMASK_XYZW; + } + | DOT writemask_x writemask_y writemask_z writemask_w + { + $$ = $2 | $3 | $4 | $5; + } + ; + +writemask_x: + /* empty */ { $$ = 0; } + | X { $$ = 1 << BRW_CHANNEL_X; } + ; + +writemask_y: + /* empty */ { $$ = 0; } + | Y { $$ = 1 << BRW_CHANNEL_Y; } + ; + +writemask_z: + /* empty */ { $$ = 0; } + | Z { $$ = 1 << BRW_CHANNEL_Z; } + ; + +writemask_w: + /* empty */ { $$ = 0; } + | W { $$ = 1 << BRW_CHANNEL_W; } + ; + +swizzle: + /* empty */ + { + $$ = BRW_SWIZZLE_NOOP; + } + | DOT chansel + { + $$ = BRW_SWIZZLE4($2, $2, $2, $2); + } + | DOT chansel chansel chansel chansel + { + $$ = BRW_SWIZZLE4($2, $3, $4, $5); + } + ; + +chansel: + X + | Y + | Z + | W + ; + +/* Instruction prediction and modifiers */ +predicate: + /* empty */ + { + brw_push_insn_state(p); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); + brw_set_default_predicate_inverse(p, 
false); + } + | LPAREN predstate flagreg predctrl RPAREN + { + brw_push_insn_state(p); + brw_set_default_predicate_inverse(p, $2); + brw_set_default_flag_reg(p, $3.nr, $3.subnr); + brw_set_default_predicate_control(p, $4); + } + ; + +predstate: + /* empty */ { $$ = 0; } + | PLUS { $$ = 0; } + | MINUS { $$ = 1; } + ; + +predctrl: + /* empty */ { $$ = BRW_PREDICATE_NORMAL; } + | DOT X { $$ = BRW_PREDICATE_ALIGN16_REPLICATE_X; } + | DOT Y { $$ = BRW_PREDICATE_ALIGN16_REPLICATE_Y; } + | DOT Z { $$ = BRW_PREDICATE_ALIGN16_REPLICATE_Z; } + | DOT W { $$ = BRW_PREDICATE_ALIGN16_REPLICATE_W; } + | ANYV + | ALLV + | ANY2H + | ALL2H + | ANY4H + | ALL4H + | ANY8H + | ALL8H + | ANY16H + | ALL16H + | ANY32H + | ALL32H + ; + +/* Source Modification */ +negate: + /* empty */ { $$ = 0; } + | MINUS { $$ = 1; } + ; + +abs: + /* empty */ { $$ = 0; } + | ABS { $$ = 1; } + ; + +/* Flag (Conditional) Modifier */ +cond_mod: + condModifiers + { + $$.cond_modifier = $1; + $$.flag_reg_nr = 0; + $$.flag_subreg_nr = 0; + } + | condModifiers DOT flagreg + { + $$.cond_modifier = $1; + $$.flag_reg_nr = $3.nr; + $$.flag_subreg_nr = $3.subnr; + } + ; + +condModifiers: + /* empty */ { $$ = BRW_CONDITIONAL_NONE; } + | ZERO + | EQUAL + | NOT_ZERO + | NOT_EQUAL + | GREATER + | GREATER_EQUAL + | LESS + | LESS_EQUAL + | OVERFLOW + | ROUND_INCREMENT + | UNORDERED + ; + +/* message details for send */ +msgdesc: + MSGDESC_BEGIN msgdesc_parts MSGDESC_END { $$ = $2; } + ; + +msgdesc_parts: + SRC1_LEN ASSIGN INTEGER msgdesc_parts + { + $$ = $4; + $$.src1_len = $3; + } + | EX_BSO msgdesc_parts + { + $$ = $2; + $$.ex_bso = 1; + } + | INTEGER msgdesc_parts { $$ = $2; } + | ASSIGN msgdesc_parts { $$ = $2; } + | /* empty */ + { + memset(&$$, 0, sizeof($$)); + } + ; + +saturate: + /* empty */ { $$ = BRW_INSTRUCTION_NORMAL; } + | SATURATE { $$ = BRW_INSTRUCTION_SATURATE; } + ; + +/* Execution size */ +execsize: + /* empty */ %prec EMPTYEXECSIZE + { + $$ = 0; + } + | LPAREN exp2 RPAREN + { + if ($2 > 32 || 
!isPowerofTwo($2))
+			error(&@2, "Invalid execution size %llu\n", $2);
+
+		$$ = cvt($2) - 1;
+	}
+	;
+
+/* Instruction options */
+instoptions:
+	/* empty */
+	{
+		memset(&$$, 0, sizeof($$));
+	}
+	| LCURLY instoption_list RCURLY
+	{
+		memset(&$$, 0, sizeof($$));
+		$$ = $2;
+	}
+	;
+
+instoption_list:
+	instoption_list COMMA instoption
+	{
+		memset(&$$, 0, sizeof($$));
+		$$ = $1;
+		add_instruction_option(&$$, $3);
+	}
+	| instoption_list instoption
+	{
+		memset(&$$, 0, sizeof($$));
+		$$ = $1;
+		add_instruction_option(&$$, $2);
+	}
+	| /* empty */
+	{
+		memset(&$$, 0, sizeof($$));
+	}
+	;
+
+/* Gfx12+ software scoreboard (SWSB) dependency annotations. */
+depinfo:
+	REG_DIST_CURRENT
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_NONE;
+	}
+	| REG_DIST_FLOAT
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_FLOAT;
+	}
+	| REG_DIST_INT
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_INT;
+	}
+	| REG_DIST_LONG
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_LONG;
+	}
+	| REG_DIST_ALL
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.regdist = $1;
+		$$.pipe = TGL_PIPE_ALL;
+	}
+	| SBID_ALLOC
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.sbid = $1;
+		$$.mode = TGL_SBID_SET;
+	}
+	| SBID_WAIT_SRC
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.sbid = $1;
+		$$.mode = TGL_SBID_SRC;
+	}
+	| SBID_WAIT_DST
+	{
+		memset(&$$, 0, sizeof($$));
+		$$.sbid = $1;
+		$$.mode = TGL_SBID_DST;
+	}
+	;
+
+instoption:
+	ALIGN1 { $$.type = INSTOPTION_FLAG; $$.uint_value = ALIGN1;}
+	| ALIGN16 { $$.type = INSTOPTION_FLAG; $$.uint_value = ALIGN16; }
+	| ACCWREN { $$.type = INSTOPTION_FLAG; $$.uint_value = ACCWREN; }
+	| SECHALF { $$.type = INSTOPTION_FLAG; $$.uint_value = SECHALF; }
+	| COMPR { $$.type = INSTOPTION_FLAG; $$.uint_value = COMPR; }
+	| COMPR4 { $$.type = INSTOPTION_FLAG; $$.uint_value = COMPR4; }
+	| BREAKPOINT { $$.type = INSTOPTION_FLAG; $$.uint_value = BREAKPOINT; }
+	| NODDCLR { $$.type = INSTOPTION_FLAG; $$.uint_value = NODDCLR; }
+	| NODDCHK { $$.type = 
INSTOPTION_FLAG; $$.uint_value = NODDCHK; } + | MASK_DISABLE { $$.type = INSTOPTION_FLAG; $$.uint_value = MASK_DISABLE; } + | EOT { $$.type = INSTOPTION_FLAG; $$.uint_value = EOT; } + | SWITCH { $$.type = INSTOPTION_FLAG; $$.uint_value = SWITCH; } + | ATOMIC { $$.type = INSTOPTION_FLAG; $$.uint_value = ATOMIC; } + | CMPTCTRL { $$.type = INSTOPTION_FLAG; $$.uint_value = CMPTCTRL; } + | WECTRL { $$.type = INSTOPTION_FLAG; $$.uint_value = WECTRL; } + | QTR_2Q { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_2Q; } + | QTR_3Q { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_3Q; } + | QTR_4Q { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_4Q; } + | QTR_2H { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_2H; } + | QTR_2N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_2N; } + | QTR_3N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_3N; } + | QTR_4N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_4N; } + | QTR_5N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_5N; } + | QTR_6N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_6N; } + | QTR_7N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_7N; } + | QTR_8N { $$.type = INSTOPTION_FLAG; $$.uint_value = QTR_8N; } + | depinfo { $$.type = INSTOPTION_DEP_INFO; $$.depinfo_value = $1; } + ; + +%% + +extern int yylineno; + +#ifdef YYBYACC +void +yyerror(YYLTYPE *ltype, char *msg) +#else +void +yyerror(char *msg) +#endif +{ + fprintf(stderr, "%s: %d: %s at \"%s\"\n", + input_filename, yylineno, msg, lex_text()); + ++errors; +} diff --git a/src/intel/compiler/elk/brw_inst.h b/src/intel/compiler/elk/brw_inst.h new file mode 100644 index 00000000000..c3a324257cb --- /dev/null +++ b/src/intel/compiler/elk/brw_inst.h @@ -0,0 +1,1732 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, 
copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file brw_inst.h + * + * A representation of i965 EU assembly instructions, with helper methods to + * get and set various fields. This is the actual hardware format. + */ + +#ifndef BRW_INST_H +#define BRW_INST_H + +#include +#include + +#include "brw_eu_defines.h" +#include "brw_isa_info.h" +#include "brw_reg_type.h" +#include "dev/intel_device_info.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* brw_context.h has a forward declaration of brw_inst, so name the struct. 
*/ +typedef struct brw_inst { + uint64_t data[2]; +} brw_inst; + +static inline uint64_t brw_inst_bits(const brw_inst *inst, + unsigned high, unsigned low); +static inline void brw_inst_set_bits(brw_inst *inst, + unsigned high, unsigned low, + uint64_t value); + +#define FC(name, hi4, lo4, hi12, lo12, assertions) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) \ + brw_inst_set_bits(inst, hi12, lo12, v); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) \ + return brw_inst_bits(inst, hi12, lo12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ +} + +/* A simple macro for fields which stay in the same place on all generations, + * except for Gfx12! + */ +#define F(name, hi4, lo4, hi12, lo12) FC(name, hi4, lo4, hi12, lo12, true) + +/* A simple macro for fields which stay in the same place on all generations, + * except for Gfx12 and Gfx20. 
+ */ +#define F20(name, hi4, lo4, hi12, lo12, hi20, lo20) \ + static inline void \ + brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ + { \ + if (devinfo->ver >= 20) \ + brw_inst_set_bits(inst, hi20, lo20, v); \ + else if (devinfo->ver >= 12) \ + brw_inst_set_bits(inst, hi12, lo12, v); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ + } \ + static inline uint64_t \ + brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ + { \ + if (devinfo->ver >= 20) \ + return brw_inst_bits(inst, hi20, lo20); \ + else if (devinfo->ver >= 12) \ + return brw_inst_bits(inst, hi12, lo12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ + } + +#define FV20(name, hi4, lo4, hi12, lo12, hi20, lo20) \ + static inline void \ + brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ + { \ + if (devinfo->ver >= 20) \ + brw_inst_set_bits(inst, hi20, lo20, v & 0x7); \ + else if (devinfo->ver >= 12) \ + brw_inst_set_bits(inst, hi12, lo12, v); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ + } \ + static inline uint64_t \ + brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ + { \ + if (devinfo->ver >= 20) \ + return brw_inst_bits(inst, hi20, lo20) == 0x7 ? 
0xF : \ + brw_inst_bits(inst, hi20, lo20); \ + else if (devinfo->ver >= 12) \ + return brw_inst_bits(inst, hi12, lo12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ + } + +#define FD20(name, hi4, lo4, hi12, lo12, hi20, lo20, zero20) \ + static inline void \ + brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ + { \ + if (devinfo->ver >= 20) { \ + brw_inst_set_bits(inst, hi20, lo20, v >> 1); \ + if (zero20 == -1) \ + assert((v & 1) == 0); \ + else \ + brw_inst_set_bits(inst, zero20, zero20, v & 1); \ + } else if (devinfo->ver >= 12) \ + brw_inst_set_bits(inst, hi12, lo12, v); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ + } \ + static inline uint64_t \ + brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ + { \ + if (devinfo->ver >= 20) \ + return (brw_inst_bits(inst, hi20, lo20) << 1) | \ + (zero20 == -1 ? 0 : \ + brw_inst_bits(inst, zero20, zero20)); \ + else if (devinfo->ver >= 12) \ + return brw_inst_bits(inst, hi12, lo12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ + } + +#define BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12, hi20, lo20) \ + unsigned high, low; \ + if (devinfo->ver >= 20) { \ + high = hi20; low = lo20; \ + } else if (devinfo->ver >= 12) { \ + high = hi12; low = lo12; \ + } else if (devinfo->ver >= 8) { \ + high = hi8; low = lo8; \ + } else if (devinfo->ver >= 7) { \ + high = hi7; low = lo7; \ + } else if (devinfo->ver >= 6) { \ + high = hi6; low = lo6; \ + } else if (devinfo->ver >= 5) { \ + high = hi5; low = lo5; \ + } else if (devinfo->verx10 >= 45) { \ + high = hi45; low = lo45; \ + } else { \ + high = hi4; low = lo4; \ + } \ + assert(((int) high) != -1 && ((int) low) != -1); + +/* A general macro for cases where the field has moved to several different + * bit locations across generations. GCC appears to combine cases where the + * bits are identical, removing some of the inefficiency. 
+ */ +#define FF(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12, hi20, lo20) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t value) \ +{ \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12, hi20, lo20) \ + brw_inst_set_bits(inst, high, low, value); \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, const brw_inst *inst)\ +{ \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12, hi20, lo20) \ + return brw_inst_bits(inst, high, low); \ +} + +/* A macro for fields which moved as of Gfx8+. */ +#define F8(name, gfx4_high, gfx4_low, gfx8_high, gfx8_low, \ + gfx12_high, gfx12_low) \ +FF(name, \ + /* 4: */ gfx4_high, gfx4_low, \ + /* 4.5: */ gfx4_high, gfx4_low, \ + /* 5: */ gfx4_high, gfx4_low, \ + /* 6: */ gfx4_high, gfx4_low, \ + /* 7: */ gfx4_high, gfx4_low, \ + /* 8: */ gfx8_high, gfx8_low, \ + /* 12: */ gfx12_high, gfx12_low, \ + /* 20: */ gfx12_high, gfx12_low); + +/* Macro for fields that gained extra discontiguous MSBs in Gfx12 (specified + * by hi12ex-lo12ex). 
+ */ +#define FFDC(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12ex, lo12ex, hi12, lo12, assertions) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t value) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) { \ + const unsigned k = hi12 - lo12 + 1; \ + if (hi12ex != -1 && lo12ex != -1) \ + brw_inst_set_bits(inst, hi12ex, lo12ex, value >> k); \ + brw_inst_set_bits(inst, hi12, lo12, value & ((1ull << k) - 1)); \ + } else { \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, -1, -1, -1, -1); \ + brw_inst_set_bits(inst, high, low, value); \ + } \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, const brw_inst *inst)\ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) { \ + const unsigned k = hi12 - lo12 + 1; \ + return (hi12ex == -1 || lo12ex == -1 ? 0 : \ + brw_inst_bits(inst, hi12ex, lo12ex) << k) | \ + brw_inst_bits(inst, hi12, lo12); \ + } else { \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, -1, -1, -1, -1); \ + return brw_inst_bits(inst, high, low); \ + } \ +} + +#define FD(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12ex, lo12ex, hi12, lo12) \ + FFDC(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12ex, lo12ex, hi12, lo12, true) + +/* Macro for fields that didn't move across generations until Gfx12, and then + * gained extra discontiguous bits. + */ +#define FDC(name, hi4, lo4, hi12ex, lo12ex, hi12, lo12, assertions) \ + FFDC(name, hi4, lo4, hi4, lo4, hi4, lo4, hi4, lo4, \ + hi4, lo4, hi4, lo4, hi12ex, lo12ex, hi12, lo12, assertions) + + +/* Macro for the 2-bit register file field, which on Gfx12+ is stored as the + * variable length combination of an IsImm (hi12) bit and an additional file + * (lo12) bit. 
+ */ +#define FI(name, hi4, lo4, hi8, lo8, hi12, lo12) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t value) \ +{ \ + if (devinfo->ver >= 12) { \ + brw_inst_set_bits(inst, hi12, hi12, value >> 1); \ + if ((value >> 1) == 0) \ + brw_inst_set_bits(inst, lo12, lo12, value & 1); \ + } else { \ + BOUNDS(hi4, lo4, hi4, lo4, hi4, lo4, hi4, lo4, \ + hi4, lo4, hi8, lo8, -1, -1, -1, -1); \ + brw_inst_set_bits(inst, high, low, value); \ + } \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, const brw_inst *inst)\ +{ \ + if (devinfo->ver >= 12) { \ + return (brw_inst_bits(inst, hi12, hi12) << 1) | \ + (brw_inst_bits(inst, hi12, hi12) == 0 ? \ + brw_inst_bits(inst, lo12, lo12) : 1); \ + } else { \ + BOUNDS(hi4, lo4, hi4, lo4, hi4, lo4, hi4, lo4, \ + hi4, lo4, hi8, lo8, -1, -1, -1, -1); \ + return brw_inst_bits(inst, high, low); \ + } \ +} + +/* Macro for fields that become a constant in Gfx12+ not actually represented + * in the instruction. 
+ */ +#define FK(name, hi4, lo4, const12) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ +{ \ + if (devinfo->ver >= 12) \ + assert(v == (const12)); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + if (devinfo->ver >= 12) \ + return (const12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ +} + +FV20(src1_vstride, /* 4+ */ 120, 117, /* 12+ */ 119, 116, /* 20+ */ 118, 116) +F(src1_width, /* 4+ */ 116, 114, /* 12+ */ 115, 113) +F(src1_da16_swiz_w, /* 4+ */ 115, 114, /* 12+ */ -1, -1) +F(src1_da16_swiz_z, /* 4+ */ 113, 112, /* 12+ */ -1, -1) +F(src1_hstride, /* 4+ */ 113, 112, /* 12+ */ 97, 96) +F(src1_address_mode, /* 4+ */ 111, 111, /* 12+ */ 112, 112) +/** Src1.SrcMod @{ */ +F(src1_negate, /* 4+ */ 110, 110, /* 12+ */ 121, 121) +F(src1_abs, /* 4+ */ 109, 109, /* 12+ */ 120, 120) +/** @} */ +F8(src1_ia_subreg_nr, /* 4+ */ 108, 106, /* 8+ */ 108, 105, /* 12+ */ 111, 108) +F(src1_da_reg_nr, /* 4+ */ 108, 101, /* 12+ */ 111, 104) +F(src1_da16_subreg_nr, /* 4+ */ 100, 100, /* 12+ */ -1, -1) +FD20(src1_da1_subreg_nr, /* 4+ */ 100, 96, /* 12+ */ 103, 99, /* 20+ */ 103, 99, -1) +F(src1_da16_swiz_y, /* 4+ */ 99, 98, /* 12+ */ -1, -1) +F(src1_da16_swiz_x, /* 4+ */ 97, 96, /* 12+ */ -1, -1) +F8(src1_reg_hw_type, /* 4+ */ 46, 44, /* 8+ */ 94, 91, /* 12+ */ 91, 88) +FI(src1_reg_file, /* 4+ */ 43, 42, /* 8+ */ 90, 89, /* 12+ */ 47, 98) +F(src1_is_imm, /* 4+ */ -1, -1, /* 12+ */ 47, 47) +FV20(src0_vstride, /* 4+ */ 88, 85, /* 12+ */ 87, 84, /* 20+ */ 86, 84) +F(src0_width, /* 4+ */ 84, 82, /* 12+ */ 83, 81) +F(src0_da16_swiz_w, /* 4+ */ 83, 82, /* 12+ */ -1, -1) +F(src0_da16_swiz_z, /* 4+ */ 81, 80, /* 12+ */ -1, -1) +F(src0_hstride, /* 4+ */ 81, 80, /* 12+ */ 65, 64) +F(src0_address_mode, /* 4+ */ 79, 79, /* 12+ */ 80, 80) +/** Src0.SrcMod @{ */ +F(src0_negate, /* 4+ */ 78, 
78, /* 12+ */ 45, 45) +F(src0_abs, /* 4+ */ 77, 77, /* 12+ */ 44, 44) +/** @} */ +F8(src0_ia_subreg_nr, /* 4+ */ 76, 74, /* 8+ */ 76, 73, /* 12+ */ 79, 76) +F(src0_da_reg_nr, /* 4+ */ 76, 69, /* 12+ */ 79, 72) +F(src0_da16_subreg_nr, /* 4+ */ 68, 68, /* 12+ */ -1, -1) +FD20(src0_da1_subreg_nr, /* 4+ */ 68, 64, /* 12+ */ 71, 67, /* 20+ */ 71, 67, 87) +F(src0_da16_swiz_y, /* 4+ */ 67, 66, /* 12+ */ -1, -1) +F(src0_da16_swiz_x, /* 4+ */ 65, 64, /* 12+ */ -1, -1) +F(dst_address_mode, /* 4+ */ 63, 63, /* 12+ */ 35, 35) +F(dst_hstride, /* 4+ */ 62, 61, /* 12+ */ 49, 48) +F8(dst_ia_subreg_nr, /* 4+ */ 60, 58, /* 8+ */ 60, 57, /* 12+ */ 63, 60) +F(dst_da_reg_nr, /* 4+ */ 60, 53, /* 12+ */ 63, 56) +F(dst_da16_subreg_nr, /* 4+ */ 52, 52, /* 12+ */ -1, -1) +FD20(dst_da1_subreg_nr, /* 4+ */ 52, 48, /* 12+ */ 55, 51, /* 20+ */ 55, 51, 33) +F(da16_writemask, /* 4+ */ 51, 48, /* 12+ */ -1, -1) /* Dst.ChanEn */ +F8(src0_reg_hw_type, /* 4+ */ 41, 39, /* 8+ */ 46, 43, /* 12+ */ 43, 40) +FI(src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41, /* 12+ */ 46, 66) +F(src0_is_imm, /* 4+ */ -1, -1, /* 12+ */ 46, 46) +F8(dst_reg_hw_type, /* 4+ */ 36, 34, /* 8+ */ 40, 37, /* 12+ */ 39, 36) +F8(dst_reg_file, /* 4+ */ 33, 32, /* 8+ */ 36, 35, /* 12+ */ 50, 50) +F8(mask_control, /* 4+ */ 9, 9, /* 8+ */ 34, 34, /* 12+ */ 31, 31) +FF(flag_reg_nr, + /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 7: */ 90, 90, + /* 8: */ 33, 33, + /* 12: */ 23, 23, + /* 20: */ 23, 22) +FF(flag_subreg_nr, + /* 4-7: */ 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, + /* 8: */ 32, 32, + /* 12: */ 22, 22, + /* 20: */ 21, 21) +F(saturate, /* 4+ */ 31, 31, /* 12+ */ 34, 34) +F(debug_control, /* 4+ */ 30, 30, /* 12+ */ 30, 30) +F(cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29) +FC(branch_control, /* 4+ */ 28, 28, /* 12+ */ 33, 33, devinfo->ver >= 8) +FC(acc_wr_control, /* 4+ */ 28, 28, /* 12+ */ 33, 33, devinfo->ver >= 6 && devinfo->ver < 20) +FC(mask_control_ex, /* 4+ */ 28, 28, /* 12+ */ -1, -1, devinfo->verx10 == 
45 || + devinfo->ver == 5) +F(cond_modifier, /* 4+ */ 27, 24, /* 12+ */ 95, 92) +FC(math_function, /* 4+ */ 27, 24, /* 12+ */ 95, 92, devinfo->ver >= 6) +F20(exec_size, /* 4+ */ 23, 21, /* 12+ */ 18, 16, /* 20+ */ 20, 18) +F(pred_inv, /* 4+ */ 20, 20, /* 12+ */ 28, 28) +F20(pred_control, /* 4+ */ 19, 16, /* 12+ */ 27, 24, /* 20+ */ 27, 26) +F(thread_control, /* 4+ */ 15, 14, /* 12+ */ -1, -1) +F(atomic_control, /* 4+ */ -1, -1, /* 12+ */ 32, 32) +F20(qtr_control, /* 4+ */ 13, 12, /* 12+ */ 21, 20, /* 20+ */ 25, 24) +FF(nib_control, + /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 7: */ 47, 47, + /* 8: */ 11, 11, + /* 12: */ 19, 19, + /* 20: */ -1, -1) +F8(no_dd_check, /* 4+ */ 11, 11, /* 8+ */ 10, 10, /* 12+ */ -1, -1) +F8(no_dd_clear, /* 4+ */ 10, 10, /* 8+ */ 9, 9, /* 12+ */ -1, -1) +F20(swsb, /* 4+ */ -1, -1, /* 12+ */ 15, 8, /* 20+ */ 17, 8) +FK(access_mode, /* 4+ */ 8, 8, /* 12+ */ BRW_ALIGN_1) +/* Bit 7 is Reserved (for future Opcode expansion) */ +F(hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0) + +/** + * Three-source instructions: + * @{ + */ +F(3src_src2_reg_nr, /* 4+ */ 125, 118, /* 12+ */ 127, 120) /* same in align1 */ +F(3src_a16_src2_subreg_nr, /* 4+ */ 117, 115, /* 12+ */ -1, -1) /* Extra discontiguous bit on CHV? */ +F(3src_a16_src2_swizzle, /* 4+ */ 114, 107, /* 12+ */ -1, -1) +F(3src_a16_src2_rep_ctrl, /* 4+ */ 106, 106, /* 12+ */ -1, -1) +F(3src_src1_reg_nr, /* 4+ */ 104, 97, /* 12+ */ 111, 104) /* same in align1 */ +F(3src_a16_src1_subreg_nr, /* 4+ */ 96, 94, /* 12+ */ -1, -1) /* Extra discontiguous bit on CHV? */ +F(3src_a16_src1_swizzle, /* 4+ */ 93, 86, /* 12+ */ -1, -1) +F(3src_a16_src1_rep_ctrl, /* 4+ */ 85, 85, /* 12+ */ -1, -1) +F(3src_src0_reg_nr, /* 4+ */ 83, 76, /* 12+ */ 79, 72) /* same in align1 */ +F(3src_a16_src0_subreg_nr, /* 4+ */ 75, 73, /* 12+ */ -1, -1) /* Extra discontiguous bit on CHV? 
*/ +F(3src_a16_src0_swizzle, /* 4+ */ 72, 65, /* 12+ */ -1, -1) +F(3src_a16_src0_rep_ctrl, /* 4+ */ 64, 64, /* 12+ */ -1, -1) +F(3src_dst_reg_nr, /* 4+ */ 63, 56, /* 12+ */ 63, 56) /* same in align1 */ +F(3src_a16_dst_subreg_nr, /* 4+ */ 55, 53, /* 12+ */ -1, -1) +F(3src_a16_dst_writemask, /* 4+ */ 52, 49, /* 12+ */ -1, -1) +F8(3src_a16_nib_ctrl, /* 4+ */ 47, 47, /* 8+ */ 11, 11, /* 12+ */ -1, -1) /* only exists on IVB+ */ +F8(3src_a16_dst_hw_type, /* 4+ */ 45, 44, /* 8+ */ 48, 46, /* 12+ */ -1, -1) /* only exists on IVB+ */ +F8(3src_a16_src_hw_type, /* 4+ */ 43, 42, /* 8+ */ 45, 43, /* 12+ */ -1, -1) +F8(3src_src2_negate, /* 4+ */ 41, 41, /* 8+ */ 42, 42, /* 12+ */ 85, 85) +F8(3src_src2_abs, /* 4+ */ 40, 40, /* 8+ */ 41, 41, /* 12+ */ 84, 84) +F8(3src_src1_negate, /* 4+ */ 39, 39, /* 8+ */ 40, 40, /* 12+ */ 87, 87) +F8(3src_src1_abs, /* 4+ */ 38, 38, /* 8+ */ 39, 39, /* 12+ */ 86, 86) +F8(3src_src0_negate, /* 4+ */ 37, 37, /* 8+ */ 38, 38, /* 12+ */ 45, 45) +F8(3src_src0_abs, /* 4+ */ 36, 36, /* 8+ */ 37, 37, /* 12+ */ 44, 44) +F8(3src_a16_src1_type, /* 4+ */ -1, -1, /* 8+ */ 36, 36, /* 12+ */ -1, -1) +F8(3src_a16_src2_type, /* 4+ */ -1, -1, /* 8+ */ 35, 35, /* 12+ */ -1, -1) +F8(3src_a16_flag_reg_nr, /* 4+ */ 34, 34, /* 8+ */ 33, 33, /* 12+ */ -1, -1) +F8(3src_a16_flag_subreg_nr, /* 4+ */ 33, 33, /* 8+ */ 32, 32, /* 12+ */ -1, -1) +FF(3src_a16_dst_reg_file, + /* 4-5: doesn't exist - no 3-source instructions */ -1, -1, -1, -1, -1, -1, + /* 6: */ 32, 32, + /* 7-8: doesn't exist - no MRFs */ -1, -1, -1, -1, + /* 12: */ -1, -1, + /* 20: */ -1, -1) +F(3src_saturate, /* 4+ */ 31, 31, /* 12+ */ 34, 34) +F(3src_debug_control, /* 4+ */ 30, 30, /* 12+ */ 30, 30) +F(3src_cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29) +FC(3src_acc_wr_control, /* 4+ */ 28, 28, /* 12+ */ 33, 33, devinfo->ver < 20) +F(3src_cond_modifier, /* 4+ */ 27, 24, /* 12+ */ 95, 92) +F(3src_exec_size, /* 4+ */ 23, 21, /* 12+ */ 18, 16) +F(3src_pred_inv, /* 4+ */ 20, 20, /* 12+ */ 28, 28) 
+F20(3src_pred_control, /* 4+ */ 19, 16, /* 12+ */ 27, 24, /* 20+ */ 27, 26) +F(3src_thread_control, /* 4+ */ 15, 14, /* 12+ */ -1, -1) +F(3src_atomic_control, /* 4+ */ -1, -1, /* 12+ */ 32, 32) +F20(3src_qtr_control, /* 4+ */ 13, 12, /* 12+ */ 21, 20, /* 20+ */ 25, 24) +F8(3src_no_dd_check, /* 4+ */ 11, 11, /* 8+ */ 10, 10, /* 12+ */ -1, -1) +F8(3src_no_dd_clear, /* 4+ */ 10, 10, /* 8+ */ 9, 9, /* 12+ */ -1, -1) +F8(3src_mask_control, /* 4+ */ 9, 9, /* 8+ */ 34, 34, /* 12+ */ 31, 31) +FK(3src_access_mode, /* 4+ */ 8, 8, /* 12+ */ BRW_ALIGN_1) +F(3src_swsb, /* 4+ */ -1, -1, /* 12+ */ 15, 8) +/* Bit 7 is Reserved (for future Opcode expansion) */ +F(3src_hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0) +/** @} */ + +#define REG_TYPE(reg) \ +static inline void \ +brw_inst_set_3src_a16_##reg##_type(const struct intel_device_info *devinfo, \ + brw_inst *inst, enum brw_reg_type type) \ +{ \ + unsigned hw_type = brw_reg_type_to_a16_hw_3src_type(devinfo, type); \ + brw_inst_set_3src_a16_##reg##_hw_type(devinfo, inst, hw_type); \ +} \ + \ +static inline enum brw_reg_type \ +brw_inst_3src_a16_##reg##_type(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + unsigned hw_type = brw_inst_3src_a16_##reg##_hw_type(devinfo, inst); \ + return brw_a16_hw_3src_type_to_reg_type(devinfo, hw_type); \ +} + +REG_TYPE(dst) +REG_TYPE(src) +#undef REG_TYPE + +/** + * Three-source align1 instructions: + * @{ + */ +/* Reserved 127:126 */ +/* src2_reg_nr same in align16 */ +FD20(3src_a1_src2_subreg_nr,/* 4+ */ 117, 113, /* 12+ */ 119, 115, /* 20+ */ 119, 115, -1) +FC(3src_a1_src2_hstride, /* 4+ */ 112, 111, /* 12+ */ 113, 112, devinfo->ver >= 10) +/* Reserved 110:109. 
src2 vstride is an implied parameter */ +FC(3src_a1_src2_hw_type, /* 4+ */ 108, 106, /* 12+ */ 82, 80, devinfo->ver >= 10) +/* Reserved 105 */ +/* src1_reg_nr same in align16 */ +FD20(3src_a1_src1_subreg_nr, /* 4+ */ 96, 92, /* 12+ */ 103, 99, /* 20+ */ 103, 99, -1) +FC(3src_a1_src1_hstride, /* 4+ */ 91, 90, /* 12+ */ 97, 96, devinfo->ver >= 10) +FDC(3src_a1_src1_vstride, /* 4+ */ 89, 88, /* 12+ */ 91, 91, 83, 83, devinfo->ver >= 10) +FC(3src_a1_src1_hw_type, /* 4+ */ 87, 85, /* 12+ */ 90, 88, devinfo->ver >= 10) +/* Reserved 84 */ +/* src0_reg_nr same in align16 */ +FD20(3src_a1_src0_subreg_nr, /* 4+ */ 75, 71, /* 12+ */ 71, 67, /* 20+ */ 71, 67, -1) +FC(3src_a1_src0_hstride, /* 4+ */ 70, 69, /* 12+ */ 65, 64, devinfo->ver >= 10) +FDC(3src_a1_src0_vstride, /* 4+ */ 68, 67, /* 12+ */ 43, 43, 35, 35, devinfo->ver >= 10) +FC(3src_a1_src0_hw_type, /* 4+ */ 66, 64, /* 12+ */ 42, 40, devinfo->ver >= 10) +/* dst_reg_nr same in align16 */ +FC(3src_a1_dst_subreg_nr, /* 4+ */ 55, 54, /* 12+ */ 55, 54, devinfo->ver >= 10) +FC(3src_a1_special_acc, /* 4+ */ 55, 52, /* 12+ */ 54, 51, devinfo->ver >= 10) /* aliases dst_subreg_nr */ +/* Reserved 51:50 */ +FC(3src_a1_dst_hstride, /* 4+ */ 49, 49, /* 12+ */ 48, 48, devinfo->ver >= 10) +FC(3src_a1_dst_hw_type, /* 4+ */ 48, 46, /* 12+ */ 38, 36, devinfo->ver >= 10) +FI(3src_a1_src2_reg_file, /* 4+ */ -1, -1, /* 8+ */ 45, 45, /* 12+ */ 47, 114) +FC(3src_a1_src1_reg_file, /* 4+ */ 44, 44, /* 12+ */ 98, 98, devinfo->ver >= 10) +FI(3src_a1_src0_reg_file, /* 4+ */ -1, -1, /* 8+ */ 43, 43, /* 12+ */ 46, 66) + +F(3src_a1_src2_is_imm, /* 4+ */ -1, -1, /* 12+ */ 47, 47) +F(3src_a1_src0_is_imm, /* 4+ */ -1, -1, /* 12+ */ 46, 46) + +/* Source Modifier fields same in align16 */ +FC(3src_a1_dst_reg_file, /* 4+ */ 36, 36, /* 12+ */ 50, 50, devinfo->ver >= 10) +FC(3src_a1_exec_type, /* 4+ */ 35, 35, /* 12+ */ 39, 39, devinfo->ver >= 10) +/* Fields below this same in align16 */ +/** @} */ + +#define REG_TYPE(reg) \ +static inline void \ 
+brw_inst_set_3src_a1_##reg##_type(const struct intel_device_info *devinfo, \ + brw_inst *inst, enum brw_reg_type type) \ +{ \ + UNUSED enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) brw_inst_3src_a1_exec_type(devinfo, \ + inst); \ + if (brw_reg_type_is_floating_point(type)) { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); \ + } else { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_INT); \ + } \ + unsigned hw_type = brw_reg_type_to_a1_hw_3src_type(devinfo, type); \ + brw_inst_set_3src_a1_##reg##_hw_type(devinfo, inst, hw_type); \ +} \ + \ +static inline enum brw_reg_type \ +brw_inst_3src_a1_##reg##_type(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) brw_inst_3src_a1_exec_type(devinfo, \ + inst); \ + unsigned hw_type = brw_inst_3src_a1_##reg##_hw_type(devinfo, inst); \ + return brw_a1_hw_3src_type_to_reg_type(devinfo, hw_type, exec_type); \ +} + +REG_TYPE(dst) +REG_TYPE(src0) +REG_TYPE(src1) +REG_TYPE(src2) +#undef REG_TYPE + +/** + * Three-source align1 instruction immediates: + * @{ + */ +static inline uint16_t +brw_inst_3src_a1_src0_imm(ASSERTED const struct intel_device_info *devinfo, + const brw_inst *insn) +{ + assert(devinfo->ver >= 10); + if (devinfo->ver >= 12) + return brw_inst_bits(insn, 79, 64); + else + return brw_inst_bits(insn, 82, 67); +} + +static inline uint16_t +brw_inst_3src_a1_src2_imm(ASSERTED const struct intel_device_info *devinfo, + const brw_inst *insn) +{ + assert(devinfo->ver >= 10); + if (devinfo->ver >= 12) + return brw_inst_bits(insn, 127, 112); + else + return brw_inst_bits(insn, 124, 109); +} + +static inline void +brw_inst_set_3src_a1_src0_imm(ASSERTED const struct intel_device_info *devinfo, + brw_inst *insn, uint16_t value) +{ + assert(devinfo->ver >= 10); + if (devinfo->ver >= 12) + brw_inst_set_bits(insn, 79, 64, value); + else + brw_inst_set_bits(insn, 82, 67, 
value); +} + +static inline void +brw_inst_set_3src_a1_src2_imm(ASSERTED const struct intel_device_info *devinfo, + brw_inst *insn, uint16_t value) +{ + assert(devinfo->ver >= 10); + if (devinfo->ver >= 12) + brw_inst_set_bits(insn, 127, 112, value); + else + brw_inst_set_bits(insn, 124, 109, value); +} +/** @} */ + +/** + * Three-source systolic instructions: + * @{ + */ +F(dpas_3src_src2_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 127, 120) +F(dpas_3src_src2_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 119, 115) +F(dpas_3src_src2_reg_file, /* 4+ */ -1, -1, /* 12+ */ 114, 114) +F(dpas_3src_src1_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 111, 104) +F(dpas_3src_src1_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 103, 99) +F(dpas_3src_src1_reg_file, /* 4+ */ -1, -1, /* 12+ */ 98, 98) +F(dpas_3src_src1_hw_type, /* 4+ */ -1, -1, /* 12+ */ 90, 88) +F(dpas_3src_src1_subbyte, /* 4+ */ -1, -1, /* 12+ */ 87, 86) +F(dpas_3src_src2_subbyte, /* 4+ */ -1, -1, /* 12+ */ 85, 84) +F(dpas_3src_src2_hw_type, /* 4+ */ -1, -1, /* 12+ */ 82, 80) +F(dpas_3src_src0_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 79, 72) +F(dpas_3src_src0_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 71, 67) +F(dpas_3src_src0_reg_file, /* 4+ */ -1, -1, /* 12+ */ 66, 66) +F(dpas_3src_dst_reg_nr, /* 4+ */ -1, -1, /* 12+ */ 63, 56) +F(dpas_3src_dst_subreg_nr, /* 4+ */ -1, -1, /* 12+ */ 55, 51) +F(dpas_3src_dst_reg_file, /* 4+ */ -1, -1, /* 12+ */ 50, 50) +F(dpas_3src_sdepth, /* 4+ */ -1, -1, /* 12+ */ 49, 48) +F(dpas_3src_rcount, /* 4+ */ -1, -1, /* 12+ */ 45, 43) +F(dpas_3src_src0_hw_type, /* 4+ */ -1, -1, /* 12+ */ 42, 40) +F(dpas_3src_exec_type, /* 4+ */ -1, -1, /* 12+ */ 39, 39) +F(dpas_3src_dst_hw_type, /* 4+ */ -1, -1, /* 12+ */ 38, 36) +/** @} */ + +#define REG_TYPE(reg) \ +static inline void \ +brw_inst_set_dpas_3src_##reg##_type(const struct intel_device_info *devinfo, \ + brw_inst *inst, enum brw_reg_type type) \ +{ \ + UNUSED enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) brw_inst_dpas_3src_exec_type(devinfo,\ + 
inst); \ + if (brw_reg_type_is_floating_point(type)) { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); \ + } else { \ + assert(exec_type == BRW_ALIGN1_3SRC_EXEC_TYPE_INT); \ + } \ + unsigned hw_type = brw_reg_type_to_a1_hw_3src_type(devinfo, type); \ + brw_inst_set_dpas_3src_##reg##_hw_type(devinfo, inst, hw_type); \ +} \ + \ +static inline enum brw_reg_type \ +brw_inst_dpas_3src_##reg##_type(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + enum gfx10_align1_3src_exec_type exec_type = \ + (enum gfx10_align1_3src_exec_type) brw_inst_dpas_3src_exec_type(devinfo,\ + inst); \ + unsigned hw_type = brw_inst_dpas_3src_##reg##_hw_type(devinfo, inst); \ + return brw_a1_hw_3src_type_to_reg_type(devinfo, hw_type, exec_type); \ +} + +REG_TYPE(dst) +REG_TYPE(src0) +REG_TYPE(src1) +REG_TYPE(src2) +#undef REG_TYPE + +/** + * Flow control instruction bits: + * @{ + */ +static inline void +brw_inst_set_uip(const struct intel_device_info *devinfo, + brw_inst *inst, int32_t value) +{ + assert(devinfo->ver >= 6); + + if (devinfo->ver >= 12) + brw_inst_set_src1_is_imm(devinfo, inst, 1); + + if (devinfo->ver >= 8) { + brw_inst_set_bits(inst, 95, 64, (uint32_t)value); + } else { + assert(value <= (1 << 16) - 1); + assert(value > -(1 << 16)); + brw_inst_set_bits(inst, 127, 112, (uint16_t)value); + } +} + +static inline int32_t +brw_inst_uip(const struct intel_device_info *devinfo, const brw_inst *inst) +{ + assert(devinfo->ver >= 6); + + if (devinfo->ver >= 8) { + return brw_inst_bits(inst, 95, 64); + } else { + return (int16_t)brw_inst_bits(inst, 127, 112); + } +} + +static inline void +brw_inst_set_jip(const struct intel_device_info *devinfo, + brw_inst *inst, int32_t value) +{ + assert(devinfo->ver >= 6); + + if (devinfo->ver >= 12) + brw_inst_set_src0_is_imm(devinfo, inst, 1); + + if (devinfo->ver >= 8) { + brw_inst_set_bits(inst, 127, 96, (uint32_t)value); + } else { + assert(value <= (1 << 15) - 1); + assert(value >= -(1 << 15)); + 
brw_inst_set_bits(inst, 111, 96, (uint16_t)value); + } +} + +static inline int32_t +brw_inst_jip(const struct intel_device_info *devinfo, const brw_inst *inst) +{ + assert(devinfo->ver >= 6); + + if (devinfo->ver >= 8) { + return brw_inst_bits(inst, 127, 96); + } else { + return (int16_t)brw_inst_bits(inst, 111, 96); + } +} + +/** Like FC, but using int16_t to handle negative jump targets. */ +#define FJ(name, high, low, assertions) \ +static inline void \ +brw_inst_set_##name(const struct intel_device_info *devinfo, brw_inst *inst, int16_t v) \ +{ \ + assert(assertions); \ + (void) devinfo; \ + brw_inst_set_bits(inst, high, low, (uint16_t) v); \ +} \ +static inline int16_t \ +brw_inst_##name(const struct intel_device_info *devinfo, const brw_inst *inst)\ +{ \ + assert(assertions); \ + (void) devinfo; \ + return brw_inst_bits(inst, high, low); \ +} + +FJ(gfx6_jump_count, 63, 48, devinfo->ver == 6) +FJ(gfx4_jump_count, 111, 96, devinfo->ver < 6) +FC(gfx4_pop_count, /* 4+ */ 115, 112, /* 12+ */ -1, -1, devinfo->ver < 6) +/** @} */ + +/** + * SEND instructions: + * @{ + */ +FC(send_ex_desc_ia_subreg_nr, /* 4+ */ 82, 80, /* 12+ */ 42, 40, devinfo->ver >= 9) +FC(send_src0_address_mode, /* 4+ */ 79, 79, /* 12+ */ -1, -1, devinfo->ver >= 9) +FC(send_sel_reg32_desc, /* 4+ */ 77, 77, /* 12+ */ 48, 48, devinfo->ver >= 9) +FC(send_sel_reg32_ex_desc, /* 4+ */ 61, 61, /* 12+ */ 49, 49, devinfo->ver >= 9) +F8(send_src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41, /* 12+ */ 66, 66) +FC(send_src1_reg_nr, /* 4+ */ 51, 44, /* 12+ */ 111, 104, devinfo->ver >= 9) +FC(send_src1_len, /* 4+ */ -1, -1, /* 12+ */ 103, 99, devinfo->verx10 >= 125) +FC(send_src1_reg_file, /* 4+ */ 36, 36, /* 12+ */ 98, 98, devinfo->ver >= 9) +FC(send_dst_reg_file, /* 4+ */ 35, 35, /* 12+ */ 50, 50, devinfo->ver >= 9) +FC(send_ex_bso, /* 4+ */ -1, -1, /* 12+ */ 39, 39, devinfo->verx10 >= 125) +/** @} */ + +/* Message descriptor bits */ +#define MD(x) ((x) + 96) +#define MD12(x) ((x) >= 30 ? 
(x) - 30 + 122 : \ + (x) >= 25 ? (x) - 25 + 67 : \ + (x) >= 20 ? (x) - 20 + 51 : \ + (x) >= 11 ? (x) - 11 + 113 : \ + (x) - 0 + 81) + +/** + * Set the SEND(C) message descriptor immediate. + * + * This doesn't include the SFID nor the EOT field that were considered to be + * part of the message descriptor by ancient versions of the BSpec, because + * they are present in the instruction even if the message descriptor is + * provided indirectly in the address register, so we want to specify them + * separately. + */ +static inline void +brw_inst_set_send_desc(const struct intel_device_info *devinfo, + brw_inst *inst, uint32_t value) +{ + if (devinfo->ver >= 12) { + brw_inst_set_bits(inst, 123, 122, GET_BITS(value, 31, 30)); + brw_inst_set_bits(inst, 71, 67, GET_BITS(value, 29, 25)); + brw_inst_set_bits(inst, 55, 51, GET_BITS(value, 24, 20)); + brw_inst_set_bits(inst, 121, 113, GET_BITS(value, 19, 11)); + brw_inst_set_bits(inst, 91, 81, GET_BITS(value, 10, 0)); + } else if (devinfo->ver >= 9) { + brw_inst_set_bits(inst, 126, 96, value); + assert(value >> 31 == 0); + } else if (devinfo->ver >= 5) { + brw_inst_set_bits(inst, 124, 96, value); + assert(value >> 29 == 0); + } else { + brw_inst_set_bits(inst, 119, 96, value); + assert(value >> 24 == 0); + } +} + +/** + * Get the SEND(C) message descriptor immediate. + * + * \sa brw_inst_set_send_desc(). + */ +static inline uint32_t +brw_inst_send_desc(const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + if (devinfo->ver >= 12) { + return (brw_inst_bits(inst, 123, 122) << 30 | + brw_inst_bits(inst, 71, 67) << 25 | + brw_inst_bits(inst, 55, 51) << 20 | + brw_inst_bits(inst, 121, 113) << 11 | + brw_inst_bits(inst, 91, 81)); + } else if (devinfo->ver >= 9) { + return brw_inst_bits(inst, 126, 96); + } else if (devinfo->ver >= 5) { + return brw_inst_bits(inst, 124, 96); + } else { + return brw_inst_bits(inst, 119, 96); + } +} + +/** + * Set the SEND(C) message extended descriptor immediate. 
+ * + * This doesn't include the SFID nor the EOT field that were considered to be + * part of the extended message descriptor by some versions of the BSpec, + * because they are present in the instruction even if the extended message + * descriptor is provided indirectly in a register, so we want to specify them + * separately. + */ +static inline void +brw_inst_set_send_ex_desc(const struct intel_device_info *devinfo, + brw_inst *inst, uint32_t value) +{ + if (devinfo->ver >= 12) { + brw_inst_set_bits(inst, 127, 124, GET_BITS(value, 31, 28)); + brw_inst_set_bits(inst, 97, 96, GET_BITS(value, 27, 26)); + brw_inst_set_bits(inst, 65, 64, GET_BITS(value, 25, 24)); + brw_inst_set_bits(inst, 47, 35, GET_BITS(value, 23, 11)); + brw_inst_set_bits(inst, 103, 99, GET_BITS(value, 10, 6)); + assert(GET_BITS(value, 5, 0) == 0); + } else { + assert(devinfo->ver >= 9); + brw_inst_set_bits(inst, 94, 91, GET_BITS(value, 31, 28)); + brw_inst_set_bits(inst, 88, 85, GET_BITS(value, 27, 24)); + brw_inst_set_bits(inst, 83, 80, GET_BITS(value, 23, 20)); + brw_inst_set_bits(inst, 67, 64, GET_BITS(value, 19, 16)); + assert(GET_BITS(value, 15, 0) == 0); + } +} + +/** + * Set the SENDS(C) message extended descriptor immediate. + * + * This doesn't include the SFID nor the EOT field that were considered to be + * part of the extended message descriptor by some versions of the BSpec, + * because they are present in the instruction even if the extended message + * descriptor is provided indirectly in a register, so we want to specify them + * separately. 
+ */ +static inline void +brw_inst_set_sends_ex_desc(const struct intel_device_info *devinfo, + brw_inst *inst, uint32_t value) +{ + if (devinfo->ver >= 12) { + brw_inst_set_send_ex_desc(devinfo, inst, value); + } else { + brw_inst_set_bits(inst, 95, 80, GET_BITS(value, 31, 16)); + assert(GET_BITS(value, 15, 10) == 0); + brw_inst_set_bits(inst, 67, 64, GET_BITS(value, 9, 6)); + assert(GET_BITS(value, 5, 0) == 0); + } +} + +/** + * Get the SEND(C) message extended descriptor immediate. + * + * \sa brw_inst_set_send_ex_desc(). + */ +static inline uint32_t +brw_inst_send_ex_desc(const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + if (devinfo->ver >= 12) { + return (brw_inst_bits(inst, 127, 124) << 28 | + brw_inst_bits(inst, 97, 96) << 26 | + brw_inst_bits(inst, 65, 64) << 24 | + brw_inst_bits(inst, 47, 35) << 11 | + brw_inst_bits(inst, 103, 99) << 6); + } else { + assert(devinfo->ver >= 9); + return (brw_inst_bits(inst, 94, 91) << 28 | + brw_inst_bits(inst, 88, 85) << 24 | + brw_inst_bits(inst, 83, 80) << 20 | + brw_inst_bits(inst, 67, 64) << 16); + } +} + +/** + * Get the SENDS(C) message extended descriptor immediate. + * + * \sa brw_inst_set_send_ex_desc(). 
+ */ +static inline uint32_t +brw_inst_sends_ex_desc(const struct intel_device_info *devinfo, + const brw_inst *inst) +{ + if (devinfo->ver >= 12) { + return brw_inst_send_ex_desc(devinfo, inst); + } else { + return (brw_inst_bits(inst, 95, 80) << 16 | + brw_inst_bits(inst, 67, 64) << 6); + } +} + +/** + * Fields for SEND messages: + * @{ + */ +F(eot, /* 4+ */ 127, 127, /* 12+ */ 34, 34) +FF(mlen, + /* 4: */ 119, 116, + /* 4.5: */ 119, 116, + /* 5: */ 124, 121, + /* 6: */ 124, 121, + /* 7: */ 124, 121, + /* 8: */ 124, 121, + /* 12: */ MD12(28), MD12(25), + /* 20: */ MD12(28), MD12(25)); +FF(rlen, + /* 4: */ 115, 112, + /* 4.5: */ 115, 112, + /* 5: */ 120, 116, + /* 6: */ 120, 116, + /* 7: */ 120, 116, + /* 8: */ 120, 116, + /* 12: */ MD12(24), MD12(20), + /* 20: */ MD12(24), MD12(20)); +FF(header_present, + /* 4: doesn't exist */ -1, -1, -1, -1, + /* 5: */ 115, 115, + /* 6: */ 115, 115, + /* 7: */ 115, 115, + /* 8: */ 115, 115, + /* 12: */ MD12(19), MD12(19), + /* 20: */ MD12(19), MD12(19)) +F(gateway_notify, /* 4+ */ MD(16), MD(15), /* 12+ */ -1, -1) +FD(function_control, + /* 4: */ 111, 96, + /* 4.5: */ 111, 96, + /* 5: */ 114, 96, + /* 6: */ 114, 96, + /* 7: */ 114, 96, + /* 8: */ 114, 96, + /* 12: */ MD12(18), MD12(11), MD12(10), MD12(0)) +FF(gateway_subfuncid, + /* 4: */ MD(1), MD(0), + /* 4.5: */ MD(1), MD(0), + /* 5: */ MD(1), MD(0), /* 2:0, but bit 2 is reserved MBZ */ + /* 6: */ MD(2), MD(0), + /* 7: */ MD(2), MD(0), + /* 8: */ MD(2), MD(0), + /* 12: */ MD12(2), MD12(0), + /* 20: */ MD12(2), MD12(0)) +FF(sfid, + /* 4: */ 123, 120, /* called msg_target */ + /* 4.5 */ 123, 120, + /* 5: */ 95, 92, + /* 6: */ 27, 24, + /* 7: */ 27, 24, + /* 8: */ 27, 24, + /* 12: */ 95, 92, + /* 20: */ 95, 92) +FF(null_rt, + /* 4-7: */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + /* 8: */ 80, 80, + /* 12: */ 44, 44, + /* 20: */ 44, 44) /* actually only Gfx11+ */ +FC(base_mrf, /* 4+ */ 27, 24, /* 12+ */ -1, -1, devinfo->ver < 6); +FF(send_rta_index, + /* 4: */ -1, -1, + /* 4.5 */ 
-1, -1, + /* 5: */ -1, -1, + /* 6: */ -1, -1, + /* 7: */ -1, -1, + /* 8: */ -1, -1, + /* 12: */ 38, 36, + /* 20: */ 38, 36) +/** @} */ + +/** + * URB message function control bits: + * @{ + */ +FF(urb_per_slot_offset, + /* 4-6: */ -1, -1, -1, -1, -1, -1, -1, -1, + /* 7: */ MD(16), MD(16), + /* 8: */ MD(17), MD(17), + /* 12: */ MD12(17), MD12(17), + /* 20: */ MD12(17), MD12(17)) +FC(urb_channel_mask_present, /* 4+ */ MD(15), MD(15), /* 12+ */ MD12(15), MD12(15), devinfo->ver >= 8) +FC(urb_complete, /* 4+ */ MD(15), MD(15), /* 12+ */ -1, -1, devinfo->ver < 8) +FC(urb_used, /* 4+ */ MD(14), MD(14), /* 12+ */ -1, -1, devinfo->ver < 7) +FC(urb_allocate, /* 4+ */ MD(13), MD(13), /* 12+ */ -1, -1, devinfo->ver < 7) +FF(urb_swizzle_control, + /* 4: */ MD(11), MD(10), + /* 4.5: */ MD(11), MD(10), + /* 5: */ MD(11), MD(10), + /* 6: */ MD(11), MD(10), + /* 7: */ MD(14), MD(14), + /* 8: */ MD(15), MD(15), + /* 12: */ -1, -1, + /* 20: */ -1, -1) +FD(urb_global_offset, + /* 4: */ MD( 9), MD(4), + /* 4.5: */ MD( 9), MD(4), + /* 5: */ MD( 9), MD(4), + /* 6: */ MD( 9), MD(4), + /* 7: */ MD(13), MD(3), + /* 8: */ MD(14), MD(4), + /* 12: */ MD12(14), MD12(11), MD12(10), MD12(4)) +FF(urb_opcode, + /* 4: */ MD( 3), MD(0), + /* 4.5: */ MD( 3), MD(0), + /* 5: */ MD( 3), MD(0), + /* 6: */ MD( 3), MD(0), + /* 7: */ MD( 2), MD(0), + /* 8: */ MD( 3), MD(0), + /* 12: */ MD12(3), MD12(0), + /* 20: */ MD12(3), MD12(0)) +/** @} */ + +/** + * Gfx4-5 math messages: + * @{ + */ +FC(math_msg_data_type, /* 4+ */ MD(7), MD(7), /* 12+ */ -1, -1, devinfo->ver < 6) +FC(math_msg_saturate, /* 4+ */ MD(6), MD(6), /* 12+ */ -1, -1, devinfo->ver < 6) +FC(math_msg_precision, /* 4+ */ MD(5), MD(5), /* 12+ */ -1, -1, devinfo->ver < 6) +FC(math_msg_signed_int, /* 4+ */ MD(4), MD(4), /* 12+ */ -1, -1, devinfo->ver < 6) +FC(math_msg_function, /* 4+ */ MD(3), MD(0), /* 12+ */ -1, -1, devinfo->ver < 6) +/** @} */ + +/** + * Sampler message function control bits: + * @{ + */ +FF(sampler_simd_mode, + /* 4: doesn't 
exist */ -1, -1, -1, -1, + /* 5: */ MD(17), MD(16), + /* 6: */ MD(17), MD(16), + /* 7: */ MD(18), MD(17), + /* 8: */ MD(18), MD(17), + /* 12: */ MD12(18), MD12(17), + /* 20: */ MD12(18), MD12(17)) +FF(sampler_msg_type, + /* 4: */ MD(15), MD(14), + /* 4.5: */ MD(15), MD(12), + /* 5: */ MD(15), MD(12), + /* 6: */ MD(15), MD(12), + /* 7: */ MD(16), MD(12), + /* 8: */ MD(16), MD(12), + /* 12: */ MD12(16), MD12(12), + /* 20: */ MD12(16), MD12(12)) +FC(sampler_return_format, /* 4+ */ MD(13), MD(12), /* 12+ */ -1, -1, devinfo->verx10 == 40) +FD(sampler, + /* 4: */ MD(11), MD(8), + /* 4.5: */ MD(11), MD(8), + /* 5: */ MD(11), MD(8), + /* 6: */ MD(11), MD(8), + /* 7: */ MD(11), MD(8), + /* 8: */ MD(11), MD(8), + /* 12: */ MD12(11), MD12(11), MD12(10), MD12(8)) +F(binding_table_index, /* 4+ */ MD(7), MD(0), /* 12+ */ MD12(7), MD12(0)) /* also used by other messages */ +/** @} */ + +/** + * Data port message function control bits: + * @{ + */ +FC(dp_category, /* 4+ */ MD(18), MD(18), /* 12+ */ MD12(18), MD12(18), devinfo->ver >= 7) + +/* Gfx4-5 store fields in different bits for read/write messages. 
*/ +FF(dp_read_msg_type, + /* 4: */ MD(13), MD(12), + /* 4.5: */ MD(13), MD(11), + /* 5: */ MD(13), MD(11), + /* 6: */ MD(16), MD(13), + /* 7: */ MD(17), MD(14), + /* 8: */ MD(17), MD(14), + /* 12: */ MD12(17), MD12(14), + /* 20: */ MD12(17), MD12(14)) +FF(dp_write_msg_type, + /* 4: */ MD(14), MD(12), + /* 4.5: */ MD(14), MD(12), + /* 5: */ MD(14), MD(12), + /* 6: */ MD(16), MD(13), + /* 7: */ MD(17), MD(14), + /* 8: */ MD(17), MD(14), + /* 12: */ MD12(17), MD12(14), + /* 20: */ MD12(17), MD12(14)) +FD(dp_read_msg_control, + /* 4: */ MD(11), MD( 8), + /* 4.5: */ MD(10), MD( 8), + /* 5: */ MD(10), MD( 8), + /* 6: */ MD(12), MD( 8), + /* 7: */ MD(13), MD( 8), + /* 8: */ MD(13), MD( 8), + /* 12: */ MD12(13), MD12(11), MD12(10), MD12(8)) +FD(dp_write_msg_control, + /* 4: */ MD(11), MD( 8), + /* 4.5: */ MD(11), MD( 8), + /* 5: */ MD(11), MD( 8), + /* 6: */ MD(12), MD( 8), + /* 7: */ MD(13), MD( 8), + /* 8: */ MD(13), MD( 8), + /* 12: */ MD12(13), MD12(11), MD12(10), MD12(8)) +FC(dp_read_target_cache, /* 4+ */ MD(15), MD(14), /* 12+ */ -1, -1, devinfo->ver < 6); + +FF(dp_write_commit, + /* 4: */ MD(15), MD(15), + /* 4.5: */ MD(15), MD(15), + /* 5: */ MD(15), MD(15), + /* 6: */ MD(17), MD(17), + /* 7+: does not exist */ -1, -1, -1, -1, + /* 12: */ -1, -1, + /* 20: */ -1, -1) + +/* Gfx6+ use the same bit locations for everything. 
*/ +FF(dp_msg_type, + /* 4-5: use dp_read_msg_type or dp_write_msg_type instead */ + -1, -1, -1, -1, -1, -1, + /* 6: */ MD(16), MD(13), + /* 7: */ MD(17), MD(14), + /* 8: */ MD(18), MD(14), + /* 12: */ MD12(18), MD12(14), + /* 20: */ MD12(18), MD12(14)) +FD(dp_msg_control, + /* 4: */ MD(11), MD( 8), + /* 4.5-5: use dp_read_msg_control or dp_write_msg_control */ -1, -1, -1, -1, + /* 6: */ MD(12), MD( 8), + /* 7: */ MD(13), MD( 8), + /* 8: */ MD(13), MD( 8), + /* 12: */ MD12(13), MD12(11), MD12(10), MD12(8)) +/** @} */ + +/** + * Scratch message bits (Gfx7+): + * @{ + */ +FC(scratch_read_write, /* 4+ */ MD(17), MD(17), /* 12+ */ MD12(17), MD12(17), devinfo->ver >= 7) /* 0 = read, 1 = write */ +FC(scratch_type, /* 4+ */ MD(16), MD(16), /* 12+ */ -1, -1, devinfo->ver >= 7) /* 0 = OWord, 1 = DWord */ +FC(scratch_invalidate_after_read, /* 4+ */ MD(15), MD(15), /* 12+ */ MD12(15), MD12(15), devinfo->ver >= 7) +FC(scratch_block_size, /* 4+ */ MD(13), MD(12), /* 12+ */ MD12(13), MD12(12), devinfo->ver >= 7) +FD(scratch_addr_offset, + /* 4: */ -1, -1, + /* 4.5: */ -1, -1, + /* 5: */ -1, -1, + /* 6: */ -1, -1, + /* 7: */ MD(11), MD(0), + /* 8: */ MD(11), MD(0), + /* 12: */ MD12(11), MD12(11), MD12(10), MD12(0)) +/** @} */ + +/** + * Render Target message function control bits: + * @{ + */ +FF(rt_last, + /* 4: */ MD(11), MD(11), + /* 4.5: */ MD(11), MD(11), + /* 5: */ MD(11), MD(11), + /* 6: */ MD(12), MD(12), + /* 7: */ MD(12), MD(12), + /* 8: */ MD(12), MD(12), + /* 12: */ MD12(12), MD12(12), + /* 20: */ MD12(12), MD12(12)) +FC(rt_slot_group, /* 4+ */ MD(11), MD(11), /* 12+ */ MD12(11), MD12(11), devinfo->ver >= 6) +F(rt_message_type, /* 4+ */ MD(10), MD( 8), /* 12+ */ MD12(10), MD12(8)) +/** @} */ + +/** + * Thread Spawn message function control bits: + * @{ + */ +FC(ts_resource_select, /* 4+ */ MD( 4), MD( 4), /* 12+ */ -1, -1, devinfo->ver < 11) +FC(ts_request_type, /* 4+ */ MD( 1), MD( 1), /* 12+ */ -1, -1, devinfo->ver < 11) +F(ts_opcode, /* 4+ */ MD( 0), MD( 0), /* 12+ 
*/ MD12(0), MD12(0)) +/** @} */ + +/** + * Pixel Interpolator message function control bits: + * @{ + */ +F(pi_simd_mode, /* 4+ */ MD(16), MD(16), /* 12+ */ MD12(16), MD12(16)) +F(pi_nopersp, /* 4+ */ MD(14), MD(14), /* 12+ */ MD12(14), MD12(14)) +F(pi_message_type, /* 4+ */ MD(13), MD(12), /* 12+ */ MD12(13), MD12(12)) +F(pi_slot_group, /* 4+ */ MD(11), MD(11), /* 12+ */ MD12(11), MD12(11)) +F(pi_message_data, /* 4+ */ MD(7), MD(0), /* 12+ */ MD12(7), MD12(0)) +/** @} */ + +/** + * Immediates: + * @{ + */ +static inline int +brw_inst_imm_d(const struct intel_device_info *devinfo, const brw_inst *insn) +{ + (void) devinfo; + return brw_inst_bits(insn, 127, 96); +} + +static inline unsigned +brw_inst_imm_ud(const struct intel_device_info *devinfo, const brw_inst *insn) +{ + (void) devinfo; + return brw_inst_bits(insn, 127, 96); +} + +static inline uint64_t +brw_inst_imm_uq(const struct intel_device_info *devinfo, + const brw_inst *insn) +{ + if (devinfo->ver >= 12) { + return brw_inst_bits(insn, 95, 64) << 32 | + brw_inst_bits(insn, 127, 96); + } else { + assert(devinfo->ver >= 8); + return brw_inst_bits(insn, 127, 64); + } +} + +static inline float +brw_inst_imm_f(const struct intel_device_info *devinfo, const brw_inst *insn) +{ + union { + float f; + uint32_t u; + } ft; + (void) devinfo; + ft.u = brw_inst_bits(insn, 127, 96); + return ft.f; +} + +static inline double +brw_inst_imm_df(const struct intel_device_info *devinfo, const brw_inst *insn) +{ + union { + double d; + uint64_t u; + } dt; + dt.u = brw_inst_imm_uq(devinfo, insn); + return dt.d; +} + +static inline void +brw_inst_set_imm_d(const struct intel_device_info *devinfo, + brw_inst *insn, int value) +{ + (void) devinfo; + return brw_inst_set_bits(insn, 127, 96, value); +} + +static inline void +brw_inst_set_imm_ud(const struct intel_device_info *devinfo, + brw_inst *insn, unsigned value) +{ + (void) devinfo; + return brw_inst_set_bits(insn, 127, 96, value); +} + +static inline void 
+brw_inst_set_imm_f(const struct intel_device_info *devinfo, + brw_inst *insn, float value) +{ + union { + float f; + uint32_t u; + } ft; + (void) devinfo; + ft.f = value; + brw_inst_set_bits(insn, 127, 96, ft.u); +} + +static inline void +brw_inst_set_imm_df(const struct intel_device_info *devinfo, + brw_inst *insn, double value) +{ + union { + double d; + uint64_t u; + } dt; + (void) devinfo; + dt.d = value; + + if (devinfo->ver >= 12) { + brw_inst_set_bits(insn, 95, 64, dt.u >> 32); + brw_inst_set_bits(insn, 127, 96, dt.u & 0xFFFFFFFF); + } else { + brw_inst_set_bits(insn, 127, 64, dt.u); + } +} + +static inline void +brw_inst_set_imm_uq(const struct intel_device_info *devinfo, + brw_inst *insn, uint64_t value) +{ + (void) devinfo; + if (devinfo->ver >= 12) { + brw_inst_set_bits(insn, 95, 64, value >> 32); + brw_inst_set_bits(insn, 127, 96, value & 0xFFFFFFFF); + } else { + brw_inst_set_bits(insn, 127, 64, value); + } +} + +/** @} */ + +#define REG_TYPE(reg) \ +static inline void \ +brw_inst_set_##reg##_file_type(const struct intel_device_info *devinfo, \ + brw_inst *inst, enum brw_reg_file file, \ + enum brw_reg_type type) \ +{ \ + assert(file <= BRW_IMMEDIATE_VALUE); \ + unsigned hw_type = brw_reg_type_to_hw_type(devinfo, file, type); \ + brw_inst_set_##reg##_reg_file(devinfo, inst, file); \ + brw_inst_set_##reg##_reg_hw_type(devinfo, inst, hw_type); \ +} \ + \ +static inline enum brw_reg_type \ +brw_inst_##reg##_type(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + unsigned file = __builtin_strcmp("dst", #reg) == 0 ? 
\ + (unsigned) BRW_GENERAL_REGISTER_FILE : \ + brw_inst_##reg##_reg_file(devinfo, inst); \ + unsigned hw_type = brw_inst_##reg##_reg_hw_type(devinfo, inst); \ + return brw_hw_type_to_reg_type(devinfo, (enum brw_reg_file)file, hw_type); \ +} + +REG_TYPE(dst) +REG_TYPE(src0) +REG_TYPE(src1) +#undef REG_TYPE + + +/* The AddrImm fields are split into two discontiguous sections on Gfx8+ */ +#define BRW_IA1_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low, \ + g12_high, g12_low, g20_high, g20_low, g20_zero) \ +static inline void \ +brw_inst_set_##reg##_ia1_addr_imm(const struct \ + intel_device_info *devinfo, \ + brw_inst *inst, \ + unsigned value) \ +{ \ + if (devinfo->ver >= 20) { \ + assert((value & ~0x7ff) == 0); \ + brw_inst_set_bits(inst, g20_high, g20_low, value >> 1); \ + if (g20_zero == -1) \ + assert((value & 1) == 0); \ + else \ + brw_inst_set_bits(inst, g20_zero, g20_zero, value & 1); \ + } else if (devinfo->ver >= 12) { \ + assert((value & ~0x3ff) == 0); \ + brw_inst_set_bits(inst, g12_high, g12_low, value); \ + } else if (devinfo->ver >= 8) { \ + assert((value & ~0x3ff) == 0); \ + brw_inst_set_bits(inst, g8_high, g8_low, value & 0x1ff); \ + brw_inst_set_bits(inst, g8_nine, g8_nine, value >> 9); \ + } else { \ + assert((value & ~0x3ff) == 0); \ + brw_inst_set_bits(inst, g4_high, g4_low, value); \ + } \ +} \ +static inline unsigned \ +brw_inst_##reg##_ia1_addr_imm(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + if (devinfo->ver >= 20) { \ + return brw_inst_bits(inst, g20_high, g20_low) << 1 | \ + (g20_zero == -1 ? 
0 : \ + brw_inst_bits(inst, g20_zero, g20_zero)); \ + } else if (devinfo->ver >= 12) { \ + return brw_inst_bits(inst, g12_high, g12_low); \ + } else if (devinfo->ver >= 8) { \ + return brw_inst_bits(inst, g8_high, g8_low) | \ + (brw_inst_bits(inst, g8_nine, g8_nine) << 9); \ + } else { \ + return brw_inst_bits(inst, g4_high, g4_low); \ + } \ +} + +/* AddrImm for Align1 Indirect Addressing */ +/* -Gen 4- ----Gfx8---- -Gfx12- ---Gfx20--- */ +BRW_IA1_ADDR_IMM(src1, 105, 96, 121, 104, 96, 107, 98, 107, 98, -1) +BRW_IA1_ADDR_IMM(src0, 73, 64, 95, 72, 64, 75, 66, 75, 66, 87) +BRW_IA1_ADDR_IMM(dst, 57, 48, 47, 56, 48, 59, 50, 59, 50, 33) + +#define BRW_IA16_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \ +static inline void \ +brw_inst_set_##reg##_ia16_addr_imm(const struct \ + intel_device_info *devinfo, \ + brw_inst *inst, unsigned value) \ +{ \ + assert(devinfo->ver < 12); \ + assert((value & ~0x3ff) == 0); \ + if (devinfo->ver >= 8) { \ + assert(GET_BITS(value, 3, 0) == 0); \ + brw_inst_set_bits(inst, g8_high, g8_low, GET_BITS(value, 8, 4)); \ + brw_inst_set_bits(inst, g8_nine, g8_nine, GET_BITS(value, 9, 9)); \ + } else { \ + brw_inst_set_bits(inst, g4_high, g4_low, value); \ + } \ +} \ +static inline unsigned \ +brw_inst_##reg##_ia16_addr_imm(const struct intel_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + assert(devinfo->ver < 12); \ + if (devinfo->ver >= 8) { \ + return (brw_inst_bits(inst, g8_high, g8_low) << 4) | \ + (brw_inst_bits(inst, g8_nine, g8_nine) << 9); \ + } else { \ + return brw_inst_bits(inst, g4_high, g4_low); \ + } \ +} + +/* AddrImm[9:0] for Align16 Indirect Addressing: + * Compared to Align1, these are missing the low 4 bits. 
+ * -Gen 4- ----Gfx8---- + */ +BRW_IA16_ADDR_IMM(src1, 105, 96, 121, 104, 100) +BRW_IA16_ADDR_IMM(src0, 73, 64, 95, 72, 68) +BRW_IA16_ADDR_IMM(dst, 57, 52, 47, 56, 52) +BRW_IA16_ADDR_IMM(send_src0, -1, -1, 78, 72, 68) +BRW_IA16_ADDR_IMM(send_dst, -1, -1, 62, 56, 52) + +/** + * Fetch a set of contiguous bits from the instruction. + * + * Bits indices range from 0..127; fields may not cross 64-bit boundaries. + */ +static inline uint64_t +brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low) +{ + assume(high < 128); + assume(high >= low); + /* We assume the field doesn't cross 64-bit boundaries. */ + const unsigned word = high / 64; + assert(word == low / 64); + + high %= 64; + low %= 64; + + const uint64_t mask = (~0ull >> (64 - (high - low + 1))); + + return (inst->data[word] >> low) & mask; +} + +/** + * Set bits in the instruction, with proper shifting and masking. + * + * Bits indices range from 0..127; fields may not cross 64-bit boundaries. + */ +static inline void +brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value) +{ + assume(high < 128); + assume(high >= low); + const unsigned word = high / 64; + assert(word == low / 64); + + high %= 64; + low %= 64; + + const uint64_t mask = (~0ull >> (64 - (high - low + 1))) << low; + + /* Make sure the supplied value actually fits in the given bitfield. */ + assert((value & (mask >> low)) == value); + + inst->data[word] = (inst->data[word] & ~mask) | (value << low); +} + +#undef BRW_IA16_ADDR_IMM +#undef BRW_IA1_ADDR_IMM +#undef MD +#undef F8 +#undef FF +#undef BOUNDS +#undef F +#undef FC +#undef F20 +#undef FD20 + +typedef struct { + uint64_t data; +} brw_compact_inst; + +/** + * Fetch a set of contiguous bits from the compacted instruction. + * + * Bits indices range from 0..63. 
+ */ +static inline unsigned +brw_compact_inst_bits(const brw_compact_inst *inst, unsigned high, unsigned low) +{ + assume(high < 64); + assume(high >= low); + const uint64_t mask = (1ull << (high - low + 1)) - 1; + + return (inst->data >> low) & mask; +} + +/** + * Set bits in the compacted instruction. + * + * Bits indices range from 0..63. + */ +static inline void +brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low, + uint64_t value) +{ + assume(high < 64); + assume(high >= low); + const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low; + + /* Make sure the supplied value actually fits in the given bitfield. */ + assert((value & (mask >> low)) == value); + + inst->data = (inst->data & ~mask) | (value << low); +} + +#define FC(name, high, low, gfx12_high, gfx12_low, assertions) \ +static inline void \ +brw_compact_inst_set_##name(const struct \ + intel_device_info *devinfo, \ + brw_compact_inst *inst, unsigned v) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) \ + brw_compact_inst_set_bits(inst, gfx12_high, gfx12_low, v); \ + else \ + brw_compact_inst_set_bits(inst, high, low, v); \ +} \ +static inline unsigned \ +brw_compact_inst_##name(const struct intel_device_info *devinfo, \ + const brw_compact_inst *inst) \ +{ \ + assert(assertions); \ + if (devinfo->ver >= 12) \ + return brw_compact_inst_bits(inst, gfx12_high, gfx12_low); \ + else \ + return brw_compact_inst_bits(inst, high, low); \ +} + +/* A simple macro for fields which stay in the same place on all generations + * except for Gfx12. + */ +#define F(name, high, low, gfx12_high, gfx12_low) \ + FC(name, high, low, gfx12_high, gfx12_low, true) + +/* A macro for fields which moved to several different locations + * across generations. 
+ */ +#define F20(name, high, low, hi8, lo8, hi12, lo12, hi20, lo20) \ +static inline void \ +brw_compact_inst_set_##name(const struct \ + intel_device_info *devinfo, \ + brw_compact_inst *inst, unsigned v) \ +{ \ + if (devinfo->ver >= 20) \ + brw_compact_inst_set_bits(inst, hi20, lo20, v); \ + else if (devinfo->ver >= 12) \ + brw_compact_inst_set_bits(inst, hi12, lo12, v); \ + else if (devinfo->ver >= 8) \ + brw_compact_inst_set_bits(inst, hi8, lo8, v); \ + else \ + brw_compact_inst_set_bits(inst, high, low, v); \ +} \ +static inline unsigned \ +brw_compact_inst_##name(const struct intel_device_info *devinfo, \ + const brw_compact_inst *inst) \ +{ \ + if (devinfo->ver >= 20) \ + return brw_compact_inst_bits(inst, hi20, lo20); \ + else if (devinfo->ver >= 12) \ + return brw_compact_inst_bits(inst, hi12, lo12); \ + else if (devinfo->ver >= 8) \ + return brw_compact_inst_bits(inst, hi8, lo8); \ + else \ + return brw_compact_inst_bits(inst, high, low); \ +} + +/* A macro for fields which gained extra discontiguous bits in Gfx20 + * (specified by hi20ex-lo20ex). 
+ */ +#define FD20(name, high, low, hi8, lo8, hi12, lo12, \ + hi20, lo20, hi20ex, lo20ex) \ + static inline void \ +brw_compact_inst_set_##name(const struct \ + intel_device_info *devinfo, \ + brw_compact_inst *inst, unsigned v) \ +{ \ + if (devinfo->ver >= 20) { \ + const unsigned k = hi20 - lo20 + 1; \ + brw_compact_inst_set_bits(inst, hi20ex, lo20ex, v >> k); \ + brw_compact_inst_set_bits(inst, hi20, lo20, v & ((1u << k) - 1)); \ + } else if (devinfo->ver >= 12) { \ + brw_compact_inst_set_bits(inst, hi12, lo12, v); \ + } else if (devinfo->ver >= 8) { \ + brw_compact_inst_set_bits(inst, hi8, lo8, v); \ + } else { \ + brw_compact_inst_set_bits(inst, high, low, v); \ + } \ +} \ +static inline unsigned \ +brw_compact_inst_##name(const struct intel_device_info *devinfo, \ + const brw_compact_inst *inst) \ +{ \ + if (devinfo->ver >= 20) { \ + const unsigned k = hi20 - lo20 + 1; \ + return (brw_compact_inst_bits(inst, hi20ex, lo20ex) << k | \ + brw_compact_inst_bits(inst, hi20, lo20)); \ + } else if (devinfo->ver >= 12) { \ + return brw_compact_inst_bits(inst, hi12, lo12); \ + } else if (devinfo->ver >= 8) { \ + return brw_compact_inst_bits(inst, hi8, lo8); \ + } else { \ + return brw_compact_inst_bits(inst, high, low); \ + } \ +} + +F(src1_reg_nr, /* 4+ */ 63, 56, /* 12+ */ 63, 56) +F(src0_reg_nr, /* 4+ */ 55, 48, /* 12+ */ 47, 40) +F20(dst_reg_nr, /* 4+ */ 47, 40, /* 8+ */ 47, 40, /* 12+ */ 23, 16, /* 20+ */ 39, 32) +F(src1_index, /* 4+ */ 39, 35, /* 12+ */ 55, 52) +F20(src0_index, /* 4+ */ 34, 30, /* 8+ */ 34, 30, /* 12+ */ 51, 48, /* 20+ */ 25, 23) +F(cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29) /* Same location as brw_inst */ +FC(flag_subreg_nr, /* 4+ */ 28, 28, /* 12+ */ -1, -1, devinfo->ver <= 6) +F(cond_modifier, /* 4+ */ 27, 24, /* 12+ */ -1, -1) /* Same location as brw_inst */ +FC(acc_wr_control, /* 4+ */ 23, 23, /* 12+ */ -1, -1, devinfo->ver >= 6) +FC(mask_control_ex, /* 4+ */ 23, 23, /* 12+ */ -1, -1, devinfo->verx10 == 45 || devinfo->ver == 5) 
+F20(subreg_index, /* 4+ */ 22, 18, /* 8+ */ 22, 18, /* 12+ */ 39, 35, /* 20+ */ 51, 48) +FD20(datatype_index, /* 4+ */ 17, 13, /* 8+ */ 17, 13, /* 12+ */ 34, 30, /* 20+ */ 28, 26, 31, 30) +F20(control_index, /* 4+ */ 12, 8, /* 8+ */ 12, 8, /* 12+ */ 28, 24, /* 20+ */ 22, 18) +F20(swsb, /* 4+ */ -1, -1, /* 8+ */ -1, -1, /* 12+ */ 15, 8, /* 20+ */ 17, 8) +F(debug_control, /* 4+ */ 7, 7, /* 12+ */ 7, 7) +F(hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0) /* Same location as brw_inst */ + +static inline unsigned +brw_compact_inst_imm(const struct intel_device_info *devinfo, + const brw_compact_inst *inst) +{ + if (devinfo->ver >= 12) { + return brw_compact_inst_bits(inst, 63, 52); + } else { + return (brw_compact_inst_bits(inst, 39, 35) << 8) | + (brw_compact_inst_bits(inst, 63, 56)); + } +} + +/** + * (Gfx8+) Compacted three-source instructions: + * @{ + */ +FC(3src_src2_reg_nr, /* 4+ */ 63, 57, /* 12+ */ 55, 48, devinfo->ver >= 8) +FC(3src_src1_reg_nr, /* 4+ */ 56, 50, /* 12+ */ 63, 56, devinfo->ver >= 8) +FC(3src_src0_reg_nr, /* 4+ */ 49, 43, /* 12+ */ 47, 40, devinfo->ver >= 8) +FC(3src_src2_subreg_nr, /* 4+ */ 42, 40, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_src1_subreg_nr, /* 4+ */ 39, 37, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_src0_subreg_nr, /* 4+ */ 36, 34, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_src2_rep_ctrl, /* 4+ */ 33, 33, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_src1_rep_ctrl, /* 4+ */ 32, 32, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_saturate, /* 4+ */ 31, 31, /* 12+ */ -1, -1, devinfo->ver >= 8) +FC(3src_debug_control, /* 4+ */ 30, 30, /* 12+ */ 7, 7, devinfo->ver >= 8) +FC(3src_cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29, devinfo->ver >= 8) +FC(3src_src0_rep_ctrl, /* 4+ */ 28, 28, /* 12+ */ -1, -1, devinfo->ver >= 8) +/* Reserved */ +F20(3src_dst_reg_nr, /* 4+ */ 18, 12, /* 8+ */ 18, 12, /* 12+ */ 23, 16, /* 20+ */ 39, 32) +F20(3src_source_index, /* 4+ */ -1, -1, /* 8+ */ 11, 10, /* 12+ */ 34, 30, /* 20+ */ 25, 22) 
+FD20(3src_subreg_index, /* 4+ */ -1, -1, /* 8+ */ -1, -1, /* 12+ */ 39, 35, /* 20+ */ 28, 26, 31, 30) +F20(3src_control_index, /* 4+ */ -1, -1, /* 8+ */ 9, 8, /* 12+ */ 28, 24, /* 20+ */ 21, 18) +F20(3src_swsb, /* 4+ */ -1, -1, /* 8+ */ -1, -1, /* 12+ */ 15, 8, /* 20+ */ 17, 8) +/* Bit 7 is Reserved (for future Opcode expansion) */ +FC(3src_hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0, devinfo->ver >= 8) +/** @} */ + +#undef F + +static inline void +brw_inst_set_opcode(const struct brw_isa_info *isa, + struct brw_inst *inst, enum opcode opcode) +{ + brw_inst_set_hw_opcode(isa->devinfo, inst, brw_opcode_encode(isa, opcode)); +} + +static inline enum opcode +brw_inst_opcode(const struct brw_isa_info *isa, + const struct brw_inst *inst) +{ + return brw_opcode_decode(isa, brw_inst_hw_opcode(isa->devinfo, inst)); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/intel/compiler/elk/brw_interpolation_map.c b/src/intel/compiler/elk/brw_interpolation_map.c new file mode 100644 index 00000000000..bdda1ad5d48 --- /dev/null +++ b/src/intel/compiler/elk/brw_interpolation_map.c @@ -0,0 +1,108 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_compiler.h" +#include "compiler/nir/nir.h" + +static char const *get_qual_name(int mode) +{ + switch (mode) { + case INTERP_MODE_NONE: return "none"; + case INTERP_MODE_FLAT: return "flat"; + case INTERP_MODE_SMOOTH: return "smooth"; + case INTERP_MODE_NOPERSPECTIVE: return "nopersp"; + default: return "???"; + } +} + +static void +gfx4_frag_prog_set_interp_modes(struct brw_wm_prog_data *prog_data, + const struct intel_vue_map *vue_map, + unsigned location, unsigned slot_count, + enum glsl_interp_mode interp) +{ + for (unsigned k = 0; k < slot_count; k++) { + unsigned slot = vue_map->varying_to_slot[location + k]; + if (slot != -1 && prog_data->interp_mode[slot] == INTERP_MODE_NONE) { + prog_data->interp_mode[slot] = interp; + + if (prog_data->interp_mode[slot] == INTERP_MODE_FLAT) { + prog_data->contains_flat_varying = true; + } else if (prog_data->interp_mode[slot] == INTERP_MODE_NOPERSPECTIVE) { + prog_data->contains_noperspective_varying = true; + } + } + } +} + +/* Set up interpolation modes for every element in the VUE */ +void +brw_setup_vue_interpolation(const struct intel_vue_map *vue_map, nir_shader *nir, + struct brw_wm_prog_data *prog_data) +{ + /* Initialise interp_mode. INTERP_MODE_NONE == 0 */ + memset(prog_data->interp_mode, 0, sizeof(prog_data->interp_mode)); + + if (!vue_map) + return; + + /* HPOS always wants noperspective. setting it up here allows + * us to not need special handling in the SF program. 
+ */ + unsigned pos_slot = vue_map->varying_to_slot[VARYING_SLOT_POS]; + if (pos_slot != -1) { + prog_data->interp_mode[pos_slot] = INTERP_MODE_NOPERSPECTIVE; + prog_data->contains_noperspective_varying = true; + } + + nir_foreach_shader_in_variable(var, nir) { + unsigned location = var->data.location; + unsigned slot_count = glsl_count_attribute_slots(var->type, false); + + gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location, slot_count, + var->data.interpolation); + + if (location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1) { + location = location + VARYING_SLOT_BFC0 - VARYING_SLOT_COL0; + gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location, + slot_count, var->data.interpolation); + } + } + + const bool debug = false; + if (debug) { + fprintf(stderr, "VUE map:\n"); + for (int i = 0; i < vue_map->num_slots; i++) { + int varying = vue_map->slot_to_varying[i]; + if (varying == -1) { + fprintf(stderr, "%d: --\n", i); + continue; + } + + fprintf(stderr, "%d: %d %s ofs %d\n", + i, varying, + get_qual_name(prog_data->interp_mode[i]), + brw_vue_slot_to_offset(i)); + } + } +} diff --git a/src/intel/compiler/elk/brw_ir.h b/src/intel/compiler/elk/brw_ir.h new file mode 100644 index 00000000000..3b4b19c244a --- /dev/null +++ b/src/intel/compiler/elk/brw_ir.h @@ -0,0 +1,216 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions 
of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_H +#define BRW_IR_H + +#include <assert.h> +#include "brw_reg.h" +#include "compiler/glsl/list.h" + +#define MAX_SAMPLER_MESSAGE_SIZE 11 + +/* The sampler can return a vec5 when sampling with sparse residency. In + * SIMD32, each component takes up 4 GRFs, so we need to allow up to size-20 + * VGRFs to hold the result. + */ +#define MAX_VGRF_SIZE(devinfo) ((devinfo)->ver >= 20 ? 40 : 20) + +#ifdef __cplusplus +struct backend_reg : private brw_reg +{ + backend_reg() {} + backend_reg(const struct brw_reg &reg) : brw_reg(reg), offset(0) {} + + const brw_reg &as_brw_reg() const + { + assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM); + assert(offset == 0); + return static_cast<const brw_reg &>(*this); + } + + brw_reg &as_brw_reg() + { + assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM); + assert(offset == 0); + return static_cast<brw_reg &>(*this); + } + + bool equals(const backend_reg &r) const; + bool negative_equals(const backend_reg &r) const; + + bool is_zero() const; + bool is_one() const; + bool is_negative_one() const; + bool is_null() const; + bool is_accumulator() const; + + /** Offset from the start of the (virtual) register in bytes. 
*/ + uint16_t offset; + + using brw_reg::type; + using brw_reg::file; + using brw_reg::negate; + using brw_reg::abs; + using brw_reg::address_mode; + using brw_reg::subnr; + using brw_reg::nr; + + using brw_reg::swizzle; + using brw_reg::writemask; + using brw_reg::indirect_offset; + using brw_reg::vstride; + using brw_reg::width; + using brw_reg::hstride; + + using brw_reg::df; + using brw_reg::f; + using brw_reg::d; + using brw_reg::ud; + using brw_reg::d64; + using brw_reg::u64; +}; + +struct bblock_t; + +struct backend_instruction : public exec_node { + bool is_3src(const struct brw_compiler *compiler) const; + bool is_math() const; + bool is_control_flow_begin() const; + bool is_control_flow_end() const; + bool is_control_flow() const; + bool is_commutative() const; + bool can_do_source_mods() const; + bool can_do_saturate() const; + bool can_do_cmod() const; + bool reads_accumulator_implicitly() const; + bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const; + + /** + * Instructions that use indirect addressing have additional register + * regioning restrictions. + */ + bool uses_indirect_addressing() const; + + void remove(bblock_t *block, bool defer_later_block_ip_updates = false); + void insert_after(bblock_t *block, backend_instruction *inst); + void insert_before(bblock_t *block, backend_instruction *inst); + + /** + * True if the instruction has side effects other than writing to + * its destination registers. You are expected not to reorder or + * optimize these out unless you know what you are doing. + */ + bool has_side_effects() const; + + /** + * True if the instruction might be affected by side effects of other + * instructions. + */ + bool is_volatile() const; +#else +struct backend_instruction { + struct exec_node link; +#endif + /** @{ + * Annotation for the generated IR. One of the two can be set. + */ + const void *ir; + const char *annotation; + /** @} */ + + /** + * Execution size of the instruction. 
This is used by the generator to + * generate the correct binary for the given instruction. Current valid + * values are 1, 4, 8, 16, 32. + */ + uint8_t exec_size; + + /** + * Channel group from the hardware execution and predication mask that + * should be applied to the instruction. The subset of channel enable + * signals (calculated from the EU control flow and predication state) + * given by [group, group + exec_size) will be used to mask GRF writes and + * any other side effects of the instruction. + */ + uint8_t group; + + uint32_t offset; /**< spill/unspill offset or texture offset bitfield */ + uint8_t mlen; /**< SEND message length */ + uint8_t ex_mlen; /**< SENDS extended message length */ + int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */ + uint8_t target; /**< MRT target. */ + uint8_t sfid; /**< SFID for SEND instructions */ + uint32_t desc; /**< SEND[S] message descriptor immediate */ + uint32_t ex_desc; /**< SEND[S] extended message descriptor immediate */ + unsigned size_written; /**< Data written to the destination register in bytes. 
*/ + + enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */ + enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */ + enum brw_predicate predicate; + bool predicate_inverse:1; + bool writes_accumulator:1; /**< instruction implicitly writes accumulator */ + bool force_writemask_all:1; + bool no_dd_clear:1; + bool no_dd_check:1; + bool saturate:1; + bool shadow_compare:1; + bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */ + bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */ + bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */ + bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use + * the scratch surface offset to build + * extended descriptor + */ + bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended bindless + * surface offset (26bits instead of 20bits) + */ + bool predicate_trivial:1; /**< The predication mask applied to this + * instruction is guaranteed to be uniform and + * a superset of the execution mask of the + * present block, no currently enabled channels + * will be disabled by the predicate. + */ + bool eot:1; + + /* Chooses which flag subregister (f0.0 to f3.1) is used for conditional + * mod and predication. + */ + unsigned flag_subreg:3; + + /** + * Systolic depth used by DPAS instruction. + */ + unsigned sdepth:4; + + /** + * Repeat count used by DPAS instruction. + */ + unsigned rcount:4; + + /** The number of hardware registers used for a message header. 
*/ + uint8_t header_size; +}; + +#endif diff --git a/src/intel/compiler/elk/brw_ir_allocator.h b/src/intel/compiler/elk/brw_ir_allocator.h new file mode 100644 index 00000000000..4722ae4a4a5 --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_allocator.h @@ -0,0 +1,92 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_ALLOCATOR_H +#define BRW_IR_ALLOCATOR_H + +#include "util/compiler.h" +#include "util/glheader.h" +#include "util/macros.h" +#include "util/rounding.h" +#include "util/u_math.h" + +namespace brw { + /** + * Simple allocator used to keep track of virtual GRFs. 
+ */ + class simple_allocator { + public: + simple_allocator() : + sizes(NULL), offsets(NULL), count(0), total_size(0), capacity(0) + { + } + + ~simple_allocator() + { + free(offsets); + free(sizes); + } + + unsigned + allocate(unsigned size) + { + assert(size > 0); + if (capacity <= count) { + capacity = MAX2(16, capacity * 2); + sizes = (unsigned *)realloc(sizes, capacity * sizeof(unsigned)); + offsets = (unsigned *)realloc(offsets, capacity * sizeof(unsigned)); + } + + sizes[count] = size; + offsets[count] = total_size; + total_size += size; + + return count++; + } + + /** + * Array of sizes for each allocation. The allocation unit is up to the + * back-end, but it's expected to be one scalar value in the FS back-end + * and one vec4 in the VEC4 back-end. + */ + unsigned *sizes; + + /** + * Array of offsets from the start of the VGRF space in allocation + * units. + */ + unsigned *offsets; + + /** Total number of VGRFs allocated. */ + unsigned count; + + /** Cumulative size in allocation units. 
*/ + unsigned total_size; + + private: + unsigned capacity; + }; +} + +#endif diff --git a/src/intel/compiler/elk/brw_ir_analysis.h b/src/intel/compiler/elk/brw_ir_analysis.h new file mode 100644 index 00000000000..33b8f5178a6 --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_analysis.h @@ -0,0 +1,192 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_ANALYSIS_H +#define BRW_IR_ANALYSIS_H + +namespace brw { + /** + * Bitset of state categories that can influence the result of IR analysis + * passes. + */ + enum analysis_dependency_class { + /** + * The analysis doesn't depend on the IR, its result is effectively a + * constant during the compilation. + */ + DEPENDENCY_NOTHING = 0, + /** + * The analysis depends on the set of instructions in the program and + * their naming. 
Note that because instructions are named sequentially + * by IP this implies a dependency on the control flow edges between + * instructions. This will be signaled whenever instructions are + * inserted, removed or reordered in the program. + */ + DEPENDENCY_INSTRUCTION_IDENTITY = 0x1, + /** + * The analysis is sensitive to the detailed semantics of instructions + * in the program, where "detailed" means any change in the instruction + * data structures other than the linked-list pointers (which are + * already covered by DEPENDENCY_INSTRUCTION_IDENTITY). E.g. changing + * the negate or abs flags of an instruction source would signal this + * flag alone because it would preserve all other instruction dependency + * classes. + */ + DEPENDENCY_INSTRUCTION_DETAIL = 0x2, + /** + * The analysis depends on the set of data flow edges between + * instructions. This will be signaled whenever the dataflow relation + * between instructions has potentially changed, e.g. when the VGRF + * index of an instruction source or destination changes (in which case + * it will appear in combination with DEPENDENCY_INSTRUCTION_DETAIL), or + * when data-dependent instructions are reordered (in which case it will + * appear in combination with DEPENDENCY_INSTRUCTION_IDENTITY). + */ + DEPENDENCY_INSTRUCTION_DATA_FLOW = 0x4, + /** + * The analysis depends on all instruction dependency classes. These + * will typically be signaled simultaneously when inserting or removing + * instructions in the program (or if you're feeling too lazy to read + * through your optimization pass to figure out which of the instruction + * dependency classes above it invalidates). + */ + DEPENDENCY_INSTRUCTIONS = 0x7, + /** + * The analysis depends on the set of VGRFs in the program and their + * naming. This will be signaled when VGRFs are allocated or released. + */ + DEPENDENCY_VARIABLES = 0x8, + /** + * The analysis depends on the set of basic blocks in the program, their + * control flow edges and naming. 
+ */ + DEPENDENCY_BLOCKS = 0x10, + /** + * The analysis depends on the program being literally the same (good + * luck...), any change in the input invalidates previous analysis + * computations. + */ + DEPENDENCY_EVERYTHING = ~0 + }; + + inline analysis_dependency_class + operator|(analysis_dependency_class x, analysis_dependency_class y) + { + return static_cast<analysis_dependency_class>( + static_cast<unsigned>(x) | static_cast<unsigned>(y)); + } +} + +/** + * Instantiate a program analysis class \p L which can calculate an object of + * type \p T as result. \p C is a closure that encapsulates whatever + * information is required as argument to run the analysis pass. The purpose + * of this class is to make sure that: + * + * - The analysis pass is executed lazily whenever it's needed and multiple + * executions are optimized out as long as the cached result remains marked + * up-to-date. + * + * - There is no way to access the cached analysis result without first + * calling L::require(), which makes sure that the analysis pass is rerun + * if necessary. + * + * - The cached result doesn't become inconsistent with the program for as + * long as it remains marked up-to-date. (This is only enforced in debug + * builds for performance reasons) + * + * The requirements on \p T are the following: + * + * - Constructible with a single argument, as in 'x = T(c)' for \p c of type + * \p C. + * + * - 'x.dependency_class()' on const \p x returns a bitset of + * brw::analysis_dependency_class specifying the set of IR objects that are + * required to remain invariant for the cached analysis result to be + * considered valid. + * + * - 'x.validate(c)' on const \p x returns a boolean result specifying + * whether the analysis result \p x is consistent with the input IR. This + * is currently only used for validation in debug builds. + */ +template<class T, class C> +class brw_analysis { +public: + /** + * Construct a program analysis. 
\p c is an arbitrary object + * passed as argument to the constructor of the analysis result + * object of type \p T. + */ + brw_analysis(const C *c) : c(c), p(NULL) {} + + /** + * Destroy a program analysis. + */ + ~brw_analysis() + { + delete p; + } + + /** + * Obtain the result of a program analysis. This gives a + * guaranteed up-to-date result, the analysis pass will be + * rerun implicitly if it has become stale. + */ + T & + require() + { + if (p) + assert(p->validate(c)); + else + p = new T(c); + + return *p; + } + + const T & + require() const + { + return const_cast<brw_analysis<T, C> *>(this)->require(); + } + + /** + * Report that dependencies of the analysis pass may have changed + * since the last calculation and the cached analysis result may + * have to be discarded. + */ + void + invalidate(brw::analysis_dependency_class c) + { + if (p && (c & p->dependency_class())) { + delete p; + p = NULL; + } + } + +private: + const C *c; + T *p; +}; + +#endif diff --git a/src/intel/compiler/elk/brw_ir_fs.h b/src/intel/compiler/elk/brw_ir_fs.h new file mode 100644 index 00000000000..169449bbab8 --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_fs.h @@ -0,0 +1,737 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_FS_H +#define BRW_IR_FS_H + +#include "brw_shader.h" + +class fs_inst; + +class fs_reg : public backend_reg { +public: + DECLARE_RALLOC_CXX_OPERATORS(fs_reg) + + void init(); + + fs_reg(); + fs_reg(struct ::brw_reg reg); + fs_reg(enum brw_reg_file file, unsigned nr); + fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type); + + bool equals(const fs_reg &r) const; + bool negative_equals(const fs_reg &r) const; + bool is_contiguous() const; + + /** + * Return the size in bytes of a single logical component of the + * register assuming the given execution width. 
+ */ + unsigned component_size(unsigned width) const; + + /** Register region horizontal stride */ + uint8_t stride; +}; + +static inline fs_reg +negate(fs_reg reg) +{ + assert(reg.file != IMM); + reg.negate = !reg.negate; + return reg; +} + +static inline fs_reg +retype(fs_reg reg, enum brw_reg_type type) +{ + reg.type = type; + return reg; +} + +static inline fs_reg +byte_offset(fs_reg reg, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + break; + case VGRF: + case ATTR: + case UNIFORM: + reg.offset += delta; + break; + case MRF: { + const unsigned suboffset = reg.offset + delta; + reg.nr += suboffset / REG_SIZE; + reg.offset = suboffset % REG_SIZE; + break; + } + case ARF: + case FIXED_GRF: { + const unsigned suboffset = reg.subnr + delta; + reg.nr += suboffset / REG_SIZE; + reg.subnr = suboffset % REG_SIZE; + break; + } + case IMM: + default: + assert(delta == 0); + } + return reg; +} + +static inline fs_reg +horiz_offset(const fs_reg ®, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + case UNIFORM: + case IMM: + /* These only have a single component that is implicitly splatted. A + * horizontal offset should be a harmless no-op. + * XXX - Handle vector immediates correctly. + */ + return reg; + case VGRF: + case MRF: + case ATTR: + return byte_offset(reg, delta * reg.stride * type_sz(reg.type)); + case ARF: + case FIXED_GRF: + if (reg.is_null()) { + return reg; + } else { + const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0; + const unsigned vstride = reg.vstride ? 
1 << (reg.vstride - 1) : 0; + const unsigned width = 1 << reg.width; + + if (delta % width == 0) { + return byte_offset(reg, delta / width * vstride * type_sz(reg.type)); + } else { + assert(vstride == hstride * width); + return byte_offset(reg, delta * hstride * type_sz(reg.type)); + } + } + } + unreachable("Invalid register file"); +} + +static inline fs_reg +offset(fs_reg reg, unsigned width, unsigned delta) +{ + switch (reg.file) { + case BAD_FILE: + break; + case ARF: + case FIXED_GRF: + case MRF: + case VGRF: + case ATTR: + case UNIFORM: + return byte_offset(reg, delta * reg.component_size(width)); + case IMM: + assert(delta == 0); + } + return reg; +} + +/** + * Get the scalar channel of \p reg given by \p idx and replicate it to all + * channels of the result. + */ +static inline fs_reg +component(fs_reg reg, unsigned idx) +{ + reg = horiz_offset(reg, idx); + reg.stride = 0; + if (reg.file == ARF || reg.file == FIXED_GRF) { + reg.vstride = BRW_VERTICAL_STRIDE_0; + reg.width = BRW_WIDTH_1; + reg.hstride = BRW_HORIZONTAL_STRIDE_0; + } + return reg; +} + +/** + * Return an integer identifying the discrete address space a register is + * contained in. A register is by definition fully contained in the single + * reg_space it belongs to, so two registers with different reg_space ids are + * guaranteed not to overlap. Most register files are a single reg_space of + * its own, only the VGRF and ATTR files are composed of multiple discrete + * address spaces, one for each allocation and input attribute respectively. + */ +static inline uint32_t +reg_space(const fs_reg &r) +{ + return r.file << 16 | (r.file == VGRF || r.file == ATTR ? r.nr : 0); +} + +/** + * Return the base offset in bytes of a register relative to the start of its + * reg_space(). + */ +static inline unsigned +reg_offset(const fs_reg &r) +{ + return (r.file == VGRF || r.file == IMM || r.file == ATTR ? 0 : r.nr) * + (r.file == UNIFORM ? 
4 : REG_SIZE) + r.offset + + (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0); +} + +/** + * Return the amount of padding in bytes left unused between individual + * components of register \p r due to a (horizontal) stride value greater than + * one, or zero if components are tightly packed in the register file. + */ +static inline unsigned +reg_padding(const fs_reg &r) +{ + const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride : + r.hstride == 0 ? 0 : + 1 << (r.hstride - 1)); + return (MAX2(1, stride) - 1) * type_sz(r.type); +} + +/* Do not call this directly. Call regions_overlap() instead. */ +static inline bool +regions_overlap_MRF(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds) +{ + if (r.nr & BRW_MRF_COMPR4) { + fs_reg t = r; + t.nr &= ~BRW_MRF_COMPR4; + /* COMPR4 regions are translated by the hardware during decompression + * into two separate half-regions 4 MRFs apart from each other. + * + * Note: swapping s and t in this parameter list eliminates one possible + * level of recursion (since the s in the called versions of + * regions_overlap_MRF can't be COMPR4), and that makes the compiled + * code a lot smaller. + */ + return regions_overlap_MRF(s, ds, t, dr / 2) || + regions_overlap_MRF(s, ds, byte_offset(t, 4 * REG_SIZE), dr / 2); + } else if (s.nr & BRW_MRF_COMPR4) { + return regions_overlap_MRF(s, ds, r, dr); + } + + return !((r.nr * REG_SIZE + r.offset + dr) <= (s.nr * REG_SIZE + s.offset) || + (s.nr * REG_SIZE + s.offset + ds) <= (r.nr * REG_SIZE + r.offset)); +} + +/** + * Return whether the register region starting at \p r and spanning \p dr + * bytes could potentially overlap the register region starting at \p s and + * spanning \p ds bytes. 
+ */ +static inline bool +regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds) +{ + if (r.file != s.file) + return false; + + if (r.file == VGRF) { + return r.nr == s.nr && + !(r.offset + dr <= s.offset || s.offset + ds <= r.offset); + } else if (r.file != MRF) { + return !(reg_offset(r) + dr <= reg_offset(s) || + reg_offset(s) + ds <= reg_offset(r)); + } else { + return regions_overlap_MRF(r, dr, s, ds); + } +} + +/** + * Check that the register region given by r [r.offset, r.offset + dr[ + * is fully contained inside the register region given by s + * [s.offset, s.offset + ds[. + */ +static inline bool +region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds) +{ + return reg_space(r) == reg_space(s) && + reg_offset(r) >= reg_offset(s) && + reg_offset(r) + dr <= reg_offset(s) + ds; +} + +/** + * Return whether the given register region is n-periodic, i.e. whether the + * original region remains invariant after shifting it by \p n scalar + * channels. + */ +static inline bool +is_periodic(const fs_reg ®, unsigned n) +{ + if (reg.file == BAD_FILE || reg.is_null()) { + return true; + + } else if (reg.file == IMM) { + const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV || + reg.type == BRW_REGISTER_TYPE_V ? 8 : + reg.type == BRW_REGISTER_TYPE_VF ? 4 : + 1); + return n % period == 0; + + } else if (reg.file == ARF || reg.file == FIXED_GRF) { + const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 : + reg.vstride == 0 ? 1 << reg.width : + ~0); + return n % period == 0; + + } else { + return reg.stride == 0; + } +} + +static inline bool +is_uniform(const fs_reg ®) +{ + return is_periodic(reg, 1); +} + +/** + * Get the specified 8-component quarter of a register. 
+ */ +static inline fs_reg +quarter(const fs_reg ®, unsigned idx) +{ + assert(idx < 4); + return horiz_offset(reg, 8 * idx); +} + +/** + * Reinterpret each channel of register \p reg as a vector of values of the + * given smaller type and take the i-th subcomponent from each. + */ +static inline fs_reg +subscript(fs_reg reg, brw_reg_type type, unsigned i) +{ + assert((i + 1) * type_sz(type) <= type_sz(reg.type)); + + if (reg.file == ARF || reg.file == FIXED_GRF) { + /* The stride is encoded inconsistently for fixed GRF and ARF registers + * as the log2 of the actual vertical and horizontal strides. + */ + const int delta = util_logbase2(type_sz(reg.type)) - + util_logbase2(type_sz(type)); + reg.hstride += (reg.hstride ? delta : 0); + reg.vstride += (reg.vstride ? delta : 0); + + } else if (reg.file == IMM) { + unsigned bit_size = type_sz(type) * 8; + reg.u64 >>= i * bit_size; + reg.u64 &= BITFIELD64_MASK(bit_size); + if (bit_size <= 16) + reg.u64 |= reg.u64 << 16; + return retype(reg, type); + } else { + reg.stride *= type_sz(reg.type) / type_sz(type); + } + + return byte_offset(retype(reg, type), i * type_sz(type)); +} + +static inline fs_reg +horiz_stride(fs_reg reg, unsigned s) +{ + reg.stride *= s; + return reg; +} + +static const fs_reg reg_undef; + +class fs_inst : public backend_instruction { + fs_inst &operator=(const fs_inst &); + + void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, + const fs_reg *src, unsigned sources); + +public: + DECLARE_RALLOC_CXX_OPERATORS(fs_inst) + + fs_inst(); + fs_inst(enum opcode opcode, uint8_t exec_size); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1); + fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg &src0, const fs_reg &src1, const fs_reg &src2); + 
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, + const fs_reg src[], unsigned sources); + fs_inst(const fs_inst &that); + ~fs_inst(); + + void resize_sources(uint8_t num_sources); + + bool is_send_from_grf() const; + bool is_payload(unsigned arg) const; + bool is_partial_write() const; + unsigned components_read(unsigned i) const; + unsigned size_read(int arg) const; + bool can_do_source_mods(const struct intel_device_info *devinfo) const; + bool can_do_cmod(); + bool can_change_types() const; + bool has_source_and_destination_hazard() const; + unsigned implied_mrf_writes() const; + + /** + * Return whether \p arg is a control source of a virtual instruction which + * shouldn't contribute to the execution type and usual regioning + * restriction calculations of arithmetic instructions. + */ + bool is_control_source(unsigned arg) const; + + /** + * Return the subset of flag registers read by the instruction as a bitset + * with byte granularity. + */ + unsigned flags_read(const intel_device_info *devinfo) const; + + /** + * Return the subset of flag registers updated by the instruction (either + * partially or fully) as a bitset with byte granularity. + */ + unsigned flags_written(const intel_device_info *devinfo) const; + + /** + * Return true if this instruction is a sampler message gathering residency + * data. + */ + bool has_sampler_residency() const; + + fs_reg dst; + fs_reg *src; + + uint8_t sources; /**< Number of fs_reg sources. */ + + bool last_rt:1; + bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */ + bool keep_payload_trailing_zeros; + + tgl_swsb sched; /**< Scheduling info. */ +}; + +/** + * Make the execution of \p inst dependent on the evaluation of a possibly + * inverted predicate. 
+ */ +static inline fs_inst * +set_predicate_inv(enum brw_predicate pred, bool inverse, + fs_inst *inst) +{ + inst->predicate = pred; + inst->predicate_inverse = inverse; + return inst; +} + +/** + * Make the execution of \p inst dependent on the evaluation of a predicate. + */ +static inline fs_inst * +set_predicate(enum brw_predicate pred, fs_inst *inst) +{ + return set_predicate_inv(pred, false, inst); +} + +/** + * Write the result of evaluating the condition given by \p mod to a flag + * register. + */ +static inline fs_inst * +set_condmod(enum brw_conditional_mod mod, fs_inst *inst) +{ + inst->conditional_mod = mod; + return inst; +} + +/** + * Clamp the result of \p inst to the saturation range of its destination + * datatype. + */ +static inline fs_inst * +set_saturate(bool saturate, fs_inst *inst) +{ + inst->saturate = saturate; + return inst; +} + +/** + * Return the number of dataflow registers written by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->dst) / + * register_size)'. The somewhat arbitrary register size unit is 4B for the + * UNIFORM and IMM files and 32B for all other files. + */ +inline unsigned +regs_written(const fs_inst *inst) +{ + assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); + return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + + inst->size_written - + MIN2(inst->size_written, reg_padding(inst->dst)), + REG_SIZE); +} + +/** + * Return the number of dataflow registers read by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->src[i]) / + * register_size)'. The somewhat arbitrary register size unit is 4B for the + * UNIFORM files and 32B for all other files. + */ +inline unsigned +regs_read(const fs_inst *inst, unsigned i) +{ + if (inst->src[i].file == IMM) + return 1; + + const unsigned reg_size = inst->src[i].file == UNIFORM ? 
4 : REG_SIZE; + return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + + inst->size_read(i) - + MIN2(inst->size_read(i), reg_padding(inst->src[i])), + reg_size); +} + +static inline enum brw_reg_type +get_exec_type(const fs_inst *inst) +{ + brw_reg_type exec_type = BRW_REGISTER_TYPE_B; + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != BAD_FILE && + !inst->is_control_source(i)) { + const brw_reg_type t = get_exec_type(inst->src[i].type); + if (type_sz(t) > type_sz(exec_type)) + exec_type = t; + else if (type_sz(t) == type_sz(exec_type) && + brw_reg_type_is_floating_point(t)) + exec_type = t; + } + } + + if (exec_type == BRW_REGISTER_TYPE_B) + exec_type = inst->dst.type; + + assert(exec_type != BRW_REGISTER_TYPE_B); + + /* Promotion of the execution type to 32-bit for conversions from or to + * half-float seems to be consistent with the following text from the + * Cherryview PRM Vol. 7, "Execution Data Type": + * + * "When single precision and half precision floats are mixed between + * source operands or between source and destination operand [..] single + * precision float is the execution datatype." + * + * and from "Register Region Restrictions": + * + * "Conversion between Integer and HF (Half Float) must be DWord aligned + * and strided by a DWord on the destination." + */ + if (type_sz(exec_type) == 2 && + inst->dst.type != exec_type) { + if (exec_type == BRW_REGISTER_TYPE_HF) + exec_type = BRW_REGISTER_TYPE_F; + else if (inst->dst.type == BRW_REGISTER_TYPE_HF) + exec_type = BRW_REGISTER_TYPE_D; + } + + return exec_type; +} + +static inline unsigned +get_exec_type_size(const fs_inst *inst) +{ + return type_sz(get_exec_type(inst)); +} + +static inline bool +is_send(const fs_inst *inst) +{ + return inst->mlen || inst->is_send_from_grf(); +} + +/** + * Return whether the instruction isn't an ALU instruction and cannot be + * assumed to complete in-order. 
+ */ +static inline bool +is_unordered(const intel_device_info *devinfo, const fs_inst *inst) +{ + return is_send(inst) || (devinfo->ver < 20 && inst->is_math()) || + inst->opcode == BRW_OPCODE_DPAS || + (devinfo->has_64bit_float_via_math_pipe && + (get_exec_type(inst) == BRW_REGISTER_TYPE_DF || + inst->dst.type == BRW_REGISTER_TYPE_DF)); +} + +/** + * Return whether the following regioning restriction applies to the specified + * instruction. From the Cherryview PRM Vol 7. "Register Region + * Restrictions": + * + * "When source or destination datatype is 64b or operation is integer DWord + * multiply, regioning in Align1 must follow these rules: + * + * 1. Source and Destination horizontal stride must be aligned to the same qword. + * 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride. + * 3. Source and Destination offset must be the same, except the case of + * scalar source." + */ +static inline bool +has_dst_aligned_region_restriction(const intel_device_info *devinfo, + const fs_inst *inst, + brw_reg_type dst_type) +{ + const brw_reg_type exec_type = get_exec_type(inst); + /* Even though the hardware spec claims that "integer DWord multiply" + * operations are restricted, empirical evidence and the behavior of the + * simulator suggest that only 32x32-bit integer multiplication is + * restricted. 
+ */ + const bool is_dword_multiply = !brw_reg_type_is_floating_point(exec_type) && + ((inst->opcode == BRW_OPCODE_MUL && + MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) || + (inst->opcode == BRW_OPCODE_MAD && + MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4)); + + if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 || + (type_sz(exec_type) == 4 && is_dword_multiply)) + return devinfo->platform == INTEL_PLATFORM_CHV || + intel_device_info_is_9lp(devinfo) || + devinfo->verx10 >= 125; + + else if (brw_reg_type_is_floating_point(dst_type)) + return devinfo->verx10 >= 125; + + else + return false; +} + +static inline bool +has_dst_aligned_region_restriction(const intel_device_info *devinfo, + const fs_inst *inst) +{ + return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type); +} + +/** + * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from + * the specified register file into a VGRF. + * + * This implies identity register regions without any source-destination + * overlap, but otherwise has no implications on the location of sources and + * destination in the register file: Gathering any number of portions from + * multiple virtual registers in any order is allowed. 
+ */ +inline bool +is_copy_payload(brw_reg_file file, const fs_inst *inst) +{ + if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD || + inst->is_partial_write() || inst->saturate || + inst->dst.file != VGRF) + return false; + + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file != file || + inst->src[i].abs || inst->src[i].negate) + return false; + + if (!inst->src[i].is_contiguous()) + return false; + + if (regions_overlap(inst->dst, inst->size_written, + inst->src[i], inst->size_read(i))) + return false; + } + + return true; +} + +/** + * Like is_copy_payload(), but the instruction is required to copy a single + * contiguous block of registers from the given register file into the + * destination without any reordering. + */ +inline bool +is_identity_payload(brw_reg_file file, const fs_inst *inst) { + if (is_copy_payload(file, inst)) { + fs_reg reg = inst->src[0]; + + for (unsigned i = 0; i < inst->sources; i++) { + reg.type = inst->src[i].type; + if (!inst->src[i].equals(reg)) + return false; + + reg = byte_offset(reg, inst->size_read(i)); + } + + return true; + } else { + return false; + } +} + +/** + * Like is_copy_payload(), but the instruction is required to source data from + * at least two disjoint VGRFs. + * + * This doesn't necessarily rule out the elimination of this instruction + * through register coalescing, but due to limitations of the register + * coalesce pass it might be impossible to do so directly until a later stage, + * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV + * instructions. + */ +inline bool +is_multi_copy_payload(const fs_inst *inst) { + if (is_copy_payload(VGRF, inst)) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].nr != inst->src[0].nr) + return true; + } + } + + return false; +} + +/** + * Like is_identity_payload(), but the instruction is required to copy the + * whole contents of a single VGRF into the destination. 
+ * + * This means that there is a good chance that the instruction will be + * eliminated through register coalescing, but it's neither a necessary nor a + * sufficient condition for that to happen -- E.g. consider the case where + * source and destination registers diverge due to other instructions in the + * program overwriting part of their contents, which isn't something we can + * predict up front based on a cheap strictly local test of the copy + * instruction. + */ +inline bool +is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst) +{ + return is_identity_payload(VGRF, inst) && + inst->src[0].offset == 0 && + alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written; +} + +bool +has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst); + +#endif diff --git a/src/intel/compiler/elk/brw_ir_performance.cpp b/src/intel/compiler/elk/brw_ir_performance.cpp new file mode 100644 index 00000000000..d50e63bfdb1 --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_performance.cpp @@ -0,0 +1,1698 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_vec4.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * Enumeration representing the various asynchronous units that can run
    * computations in parallel on behalf of a shader thread.
    */
   enum intel_eu_unit {
      /** EU front-end. */
      EU_UNIT_FE,
      /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
      EU_UNIT_FPU,
      /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
      EU_UNIT_EM,
      /** Sampler shared function. */
      EU_UNIT_SAMPLER,
      /** Pixel Interpolator shared function. */
      EU_UNIT_PI,
      /** Unified Return Buffer shared function. */
      EU_UNIT_URB,
      /** Data Port Data Cache shared function. */
      EU_UNIT_DP_DC,
      /** Data Port Render Cache shared function. */
      EU_UNIT_DP_RC,
      /** Data Port Constant Cache shared function. */
      EU_UNIT_DP_CC,
      /** Message Gateway shared function. */
      EU_UNIT_GATEWAY,
      /** Thread Spawner shared function. */
      EU_UNIT_SPAWNER,
      /* EU_UNIT_VME, */
      /* EU_UNIT_CRE, */
      /** Number of asynchronous units currently tracked. */
      EU_NUM_UNITS,
      /** Dummy unit for instructions that don't consume runtime from the above. */
      EU_UNIT_NULL = EU_NUM_UNITS
   };

   /**
    * Enumeration representing a computation result another computation can
    * potentially depend on.
    *
    * The IDs form a flat index space: each register file gets a contiguous
    * range of slots, one per register it can hold.
    */
   enum intel_eu_dependency_id {
      /* Register part of the GRF. */
      EU_DEPENDENCY_ID_GRF0 = 0,
      /* Register part of the MRF.  Only used on Gfx4-6. */
      EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
      /* Address register part of the ARF.  24 MRF slots are reserved above. */
      EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
      /* Accumulator register part of the ARF. */
      EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
      /* Flag register part of the ARF. */
      EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
      /* SBID token write completion.  Only used on Gfx12+. */
      EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
      /* SBID token read completion.  Only used on Gfx12+. */
      EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
      /* Number of computation dependencies currently tracked. */
      EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
   };

   /**
    * State of our modeling of the program execution.
    */
   struct state {
      /* Value-initializes the arrays to zero; weight starts at 1.0 so early
       * instructions are fully accounted in the utilization estimate.
       */
      state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
      /**
       * Time at which a given unit will be ready to execute the next
       * computation, in clock units.
       */
      unsigned unit_ready[EU_NUM_UNITS];
      /**
       * Time at which an instruction dependent on a given dependency ID will
       * be ready to execute, in clock units.
       */
      unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
      /**
       * Aggregated utilization of a given unit excluding idle cycles,
       * in clock units.
       */
      float unit_busy[EU_NUM_UNITS];
      /**
       * Factor of the overhead of a computation accounted for in the
       * aggregated utilization calculation.
       */
      float weight;
   };

   /**
    * Information derived from an IR instruction used to compute performance
    * estimates.  Allows the timing calculation to work on both FS and VEC4
    * instructions.
    */
   struct instruction_info {
      /* FS flavor.  Note that sc (bank-conflict penalty) is set to the full
       * destination size whenever has_bank_conflict() reports a conflict.
       */
      instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
         isa(isa), devinfo(isa->devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0),
         sc(has_bank_conflict(isa, inst) ? sd : 0),
         desc(inst->desc), sfid(inst->sfid)
      {
         /* We typically want the maximum source size, except for split send
          * messages which require the total size.
          */
         if (inst->opcode == SHADER_OPCODE_SEND) {
            /* Split sends carry their two payloads in src[2] and src[3]. */
            ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
                 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
         } else {
            for (unsigned i = 0; i < inst->sources; i++)
               ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
         }

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision.
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);

         /* rcount is only meaningful for DPAS; zero otherwise. */
         rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
      }

      /* VEC4 flavor.  No bank-conflict modeling (sc = 0) and no DPAS
       * (rcount = 0) on this path.
       */
      instruction_info(const struct brw_isa_info *isa,
                       const vec4_instruction *inst) :
         isa(isa), devinfo(isa->devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
         desc(inst->desc), sfid(inst->sfid), rcount(0)
      {
         /* Compute the maximum source size. */
         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
            ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision.
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
      }

      /** ISA encoding information */
      const struct brw_isa_info *isa;
      /** Device information. */
      const struct intel_device_info *devinfo;
      /** Instruction opcode. */
      opcode op;
      /** Destination type. */
      brw_reg_type td;
      /** Destination size in GRF units. */
      unsigned sd;
      /** Execution type. */
      brw_reg_type tx;
      /** Execution size in GRF units. */
      unsigned sx;
      /** Source size. */
      unsigned ss;
      /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
      unsigned sc;
      /** Send message descriptor. */
      uint32_t desc;
      /** Send message shared function ID. */
      uint8_t sfid;
      /** Repeat count for DPAS instructions. */
      uint8_t rcount;
   };

   /**
    * Timing information of an instruction used to estimate the performance of
    * the program.
    */
   struct perf_desc {
      perf_desc(enum intel_eu_unit u, int df, int db,
                int ls, int ld, int la, int lf) :
         u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}

      /**
       * Back-end unit its runtime shall be accounted to, in addition to the
       * EU front-end which is always assumed to be involved.
       */
      enum intel_eu_unit u;
      /**
       * Overhead cycles from the time that the EU front-end starts executing
       * the instruction until it's ready to execute the next instruction.
       */
      int df;
      /**
       * Overhead cycles from the time that the back-end starts executing the
       * instruction until it's ready to execute the next instruction.
       */
      int db;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its sources have been read from the register file.
       */
      int ls;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its regular destination has been written to the
       * register file.
       */
      int ld;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its accumulator destination has been written to the
       * ARF file.
       *
       * Note that this is an approximation of the real behavior of
       * accumulating instructions in the hardware: Instead of modeling a pair
       * of back-to-back accumulating instructions as a first computation with
       * latency equal to ld followed by another computation with a
       * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
       * model the stall as if it occurred at the top of the pipeline, with
       * the latency of the accumulator computation offset accordingly.
       */
      int la;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its flag destination has been written to the ARF
       * file.
       */
      int lf;
   };

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of parameters specifying a linear
    * approximation: Parameter X_Y specifies the derivative of timing X
    * relative to info field Y, while X_1 specifies the independent term of
    * the approximation of timing X.
    */
   perf_desc
   calculate_desc(const instruction_info &info, enum intel_eu_unit u,
                  int df_1, int df_sd, int df_sc,
                  int db_1, int db_sx,
                  int ls_1, int ld_1, int la_1, int lf_1,
                  int l_ss, int l_sd)
   {
      /* Note that la and lf take only their independent terms -- the
       * accumulator and flag latencies are not scaled by operand sizes here.
       */
      return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
                       db_1 + db_sx * int(info.sx),
                       ls_1 + l_ss * int(info.ss),
                       ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
                       la_1, lf_1);
   }

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of linear approximation parameters
    * hard-coded for each IR instruction.
    *
    * Most timing parameters are obtained from the multivariate linear
    * regression of a sample of empirical timings measured using the tm0
    * register (as can be done today by using the shader_time debugging
    * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
    * "Shared Functions - Extended Math", Section 3.2 "Performance".
+ * Parameters marked XXX shall be considered low-quality, they're possibly + * high variance or completely guessed in cases where experimental data was + * unavailable. + */ + const perf_desc + instruction_desc(const instruction_info &info) + { + const struct intel_device_info *devinfo = info.devinfo; + + switch (info.op) { + case BRW_OPCODE_SYNC: + case BRW_OPCODE_SEL: + case BRW_OPCODE_NOT: + case BRW_OPCODE_AND: + case BRW_OPCODE_OR: + case BRW_OPCODE_XOR: + case BRW_OPCODE_SHR: + case BRW_OPCODE_SHL: + case BRW_OPCODE_DIM: + case BRW_OPCODE_ASR: + case BRW_OPCODE_CMPN: + case BRW_OPCODE_F16TO32: + case BRW_OPCODE_BFREV: + case BRW_OPCODE_BFI1: + case BRW_OPCODE_AVG: + case BRW_OPCODE_FRC: + case BRW_OPCODE_RNDU: + case BRW_OPCODE_RNDD: + case BRW_OPCODE_RNDE: + case BRW_OPCODE_RNDZ: + case BRW_OPCODE_MAC: + case BRW_OPCODE_MACH: + case BRW_OPCODE_LZD: + case BRW_OPCODE_FBH: + case BRW_OPCODE_FBL: + case BRW_OPCODE_CBIT: + case BRW_OPCODE_ADDC: + case BRW_OPCODE_ROR: + case BRW_OPCODE_ROL: + case BRW_OPCODE_SUBB: + case BRW_OPCODE_SAD2: + case BRW_OPCODE_SADA2: + case BRW_OPCODE_LINE: + case BRW_OPCODE_NOP: + case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_SCRATCH_HEADER: + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + case FS_OPCODE_DDY_COARSE: + case FS_OPCODE_PIXEL_X: + case FS_OPCODE_PIXEL_Y: + case FS_OPCODE_SET_SAMPLE_ID: + case VEC4_OPCODE_MOV_BYTES: + case VEC4_OPCODE_UNPACK_UNIFORM: + case VEC4_OPCODE_DOUBLE_TO_F32: + case VEC4_OPCODE_DOUBLE_TO_D32: + case VEC4_OPCODE_DOUBLE_TO_U32: + case VEC4_OPCODE_TO_DOUBLE: + case VEC4_OPCODE_PICK_LOW_32BIT: + case VEC4_OPCODE_PICK_HIGH_32BIT: + case VEC4_OPCODE_SET_LOW_32BIT: + case VEC4_OPCODE_SET_HIGH_32BIT: + case VEC4_OPCODE_ZERO_OOB_PUSH_REGS: + case GS_OPCODE_SET_DWORD_2: + case GS_OPCODE_SET_WRITE_OFFSET: + case GS_OPCODE_SET_VERTEX_COUNT: + case GS_OPCODE_PREPARE_CHANNEL_MASKS: + case GS_OPCODE_SET_CHANNEL_MASKS: + case GS_OPCODE_GET_INSTANCE_ID: + case GS_OPCODE_SET_PRIMITIVE_ID: + 
case GS_OPCODE_SVB_SET_DST_INDEX: + case TCS_OPCODE_SRC0_010_IS_ZERO: + case TCS_OPCODE_GET_PRIMITIVE_ID: + case TES_OPCODE_GET_PRIMITIVE_ID: + case SHADER_OPCODE_READ_SR_REG: + if (devinfo->ver >= 11) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 14, 0, 0); + } else if (devinfo->ver >= 8) { + if (type_sz(info.tx) > 4) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 8, 4, 12, 0, 0); + } else if (devinfo->verx10 >= 75) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16, 0, 0); + } else { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + } + + case BRW_OPCODE_MOV: + case BRW_OPCODE_CMP: + case BRW_OPCODE_ADD: + case BRW_OPCODE_ADD3: + case BRW_OPCODE_MUL: + case SHADER_OPCODE_MOV_RELOC_IMM: + case VEC4_OPCODE_MOV_FOR_SCRATCH: + if (devinfo->ver >= 11) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6, 14, 0, 0); + } else if (devinfo->ver >= 8) { + if (type_sz(info.tx) > 4) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 8, 4, 12, 0, 0); + } else if (devinfo->verx10 >= 75) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16, 0, 0); + } else if (devinfo->ver >= 7) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 14, 10 /* XXX */, 20, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + } else { + return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0, + 0, 2 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 
0); + } + + case BRW_OPCODE_BFE: + case BRW_OPCODE_BFI2: + case BRW_OPCODE_CSEL: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case BRW_OPCODE_MAD: + if (devinfo->ver >= 11) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + } else if (devinfo->ver >= 8) { + if (type_sz(info.tx) > 4) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + } else if (devinfo->verx10 >= 75) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 16, 0, 0); + } else if (devinfo->ver >= 7) { + if (info.tx == BRW_REGISTER_TYPE_F) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 14, 10 /* XXX */, 20, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18, 0, 0); + } else if (devinfo->ver >= 6) { + return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */, + 0, 2 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + } else { + abort(); + } + + case BRW_OPCODE_F32TO16: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, 
EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case BRW_OPCODE_DP4: + case BRW_OPCODE_DPH: + case BRW_OPCODE_DP3: + case BRW_OPCODE_DP2: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + + case BRW_OPCODE_DP4A: + if (devinfo->ver >= 12) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else + abort(); + + case BRW_OPCODE_DPAS: { + unsigned ld; + + switch (info.rcount) { + case 1: + ld = 21; + break; + case 2: + ld = 22; + break; + case 8: + default: + ld = 32; + break; + } + + /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX + * for la and lf. 
+ */ + if (devinfo->verx10 >= 125) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, ld, UINT_MAX, UINT_MAX, 0, 0); + else + abort(); + } + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + if (devinfo->ver >= 6) { + switch (info.op) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4, + 0, 16, 0, 0, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2, + 0, 12, 0, 0, 0, 0); + else + return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2, + 0, 14, 0, 0, 0, 0); + + case SHADER_OPCODE_POW: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8, + 0, 24, 0, 0, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4, + 0, 20, 0, 0, 0, 0); + else + return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4, + 0, 22, 0, 0, 0, 0); + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0, + 0, 28 /* XXX */, 0, 0, 0, 0); + + default: + abort(); + } + } else { + switch (info.op) { + case SHADER_OPCODE_RCP: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8, + 0, 22, 0, 0, 0, 8); + + case SHADER_OPCODE_RSQ: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16, + 0, 44, 0, 0, 0, 8); + + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_LOG2: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24, + 0, 66, 0, 0, 0, 8); + + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_EXP2: + return 
calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32, + 0, 88, 0, 0, 0, 8); + + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48, + 0, 132, 0, 0, 0, 8); + + case SHADER_OPCODE_POW: + return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64, + 0, 176, 0, 0, 0, 8); + + default: + abort(); + } + } + + case BRW_OPCODE_DO: + if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + else + return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + + case BRW_OPCODE_IF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_WHILE: + case BRW_OPCODE_BREAK: + case BRW_OPCODE_CONTINUE: + case BRW_OPCODE_HALT: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + else + return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + + case FS_OPCODE_LINTERP: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + + case BRW_OPCODE_LRP: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4, + 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 20, 
6, 0, 0, 6, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_MOV_INDIRECT: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + + case SHADER_OPCODE_BROADCAST: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_FIND_LIVE_CHANNEL: + case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 
2, 0, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0, + 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_RND_MODE: + case SHADER_OPCODE_FLOAT_CONTROL_MODE: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_SHUFFLE: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0, + 44 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0, + 42 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0, + 0, 44 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0, + 0, 46 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + else + abort(); + + case SHADER_OPCODE_SEL_EXEC: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 
0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0, + 0, 4 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + + case SHADER_OPCODE_QUAD_SWIZZLE: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 8 /* XXX */, 4 /* XXX */, + 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 10 /* XXX */, 6 /* XXX */, + 16 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0, + 0, 8 /* XXX */, + 0, 12 /* XXX */, 8 /* XXX */, + 18 /* XXX */, 0, 0); + + case FS_OPCODE_DDY_FINE: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4, + 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, + 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0); + + case FS_OPCODE_LOAD_LIVE_CHANNELS: + if (devinfo->ver >= 11) + return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0, + 2 /* XXX */, 0, + 0, 0, 0, 10 /* XXX */, 0, 0); + else if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0, + 0, 2 /* XXX */, + 0, 0, 0, 8 /* XXX */, 0, 0); + else + abort(); + + case VEC4_OPCODE_PACK_BYTES: + 
if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + + case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: + case TCS_OPCODE_GET_INSTANCE_ID: + case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + case TES_OPCODE_CREATE_INPUT_READ_HEADER: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0, + 6 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0, + 6 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else + return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0, + 6 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + + case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: + case TCS_OPCODE_CREATE_BARRIER_HEADER: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0, + 8 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->verx10 >= 75) + return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0, + 8 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0, + 8 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + else + abort(); + + case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: + if (devinfo->ver >= 8) + return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, + 0, 0); + else if (devinfo->verx10 >= 75) + return 
calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, + 0, 0); + else if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0, + 4 /* XXX */, 0, + 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, + 0, 0); + else + abort(); + + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_GET_BUFFER_SIZE: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: + return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */, + 8 /* XXX */, 750 /* XXX */, 0, 0, + 2 /* XXX */, 0); + + case VEC4_OPCODE_URB_READ: + case VEC4_VS_OPCODE_URB_WRITE: + case VEC4_GS_OPCODE_URB_WRITE: + case VEC4_GS_OPCODE_URB_WRITE_ALLOCATE: + case GS_OPCODE_THREAD_END: + case GS_OPCODE_FF_SYNC: + case VEC4_TCS_OPCODE_URB_WRITE: + case TCS_OPCODE_RELEASE_INPUT: + case TCS_OPCODE_THREAD_END: + return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */, + 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0); + + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_INTERLOCK: + switch (info.sfid) { + case GFX6_SFID_DATAPORT_RENDER_CACHE: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0, + 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); + else + abort(); + + case BRW_SFID_URB: + case GFX7_SFID_DATAPORT_DATA_CACHE: + case GFX12_SFID_SLM: + case GFX12_SFID_TGM: + case GFX12_SFID_UGM: + case HSW_SFID_DATAPORT_DATA_CACHE_1: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + else + abort(); 
+ + default: + abort(); + } + + case SHADER_OPCODE_GFX4_SCRATCH_READ: + case SHADER_OPCODE_GFX4_SCRATCH_WRITE: + case SHADER_OPCODE_GFX7_SCRATCH_READ: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + + case VEC4_OPCODE_UNTYPED_ATOMIC: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 400 /* XXX */); + else + abort(); + + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + else + abort(); + + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + case FS_OPCODE_REP_FB_WRITE: + return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); + + case GS_OPCODE_SVB_WRITE: + if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0, + 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, + 0, 0); + else + abort(); + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + + case VS_OPCODE_PULL_CONSTANT_LOAD: + case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7: + return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16, + 8, 750, 0, 0, 2, 0); + + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0, + 0, 90 /* XXX */, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_BARRIER: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0, + 0 /* XXX */, 0, + 0, 0, 0, 0, 0, 0); + else + abort(); + + case CS_OPCODE_CS_TERMINATE: + if (devinfo->ver >= 7) + return 
calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0, + 10 /* XXX */, 0, 0, 0, 0, 0); + else + abort(); + + case SHADER_OPCODE_SEND: + switch (info.sfid) { + case GFX6_SFID_DATAPORT_CONSTANT_CACHE: + if (devinfo->ver >= 7) { + /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */ + return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + } else { + abort(); + } + case GFX6_SFID_DATAPORT_RENDER_CACHE: + if (devinfo->ver >= 7) { + switch (brw_dp_desc_msg_type(devinfo, info.desc)) { + case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP: + return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, + 30 /* XXX */, 450 /* XXX */, + 10 /* XXX */, 100 /* XXX */, + 0, 0, 0, 400 /* XXX */); + default: + return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, + 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, + 0, 0); + } + } else if (devinfo->ver >= 6) { + return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0, + 0, 450 /* XXX */, + 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); + } else { + abort(); + } + case BRW_SFID_SAMPLER: { + if (devinfo->ver >= 6) + return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16, + 8, 750, 0, 0, 2, 0); + else + abort(); + } + case GFX7_SFID_DATAPORT_DATA_CACHE: + case HSW_SFID_DATAPORT_DATA_CACHE_1: + if (devinfo->verx10 >= 75) { + switch (brw_dp_desc_msg_type(devinfo, info.desc)) { + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: + case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: + case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 400 /* XXX */); + + default: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + } + } else if (devinfo->ver >= 7) { + switch (brw_dp_desc_msg_type(devinfo, info.desc)) { + case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP: + return 
calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, + 0, 0, 0, 400 /* XXX */); + default: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + } + } else { + abort(); + } + + case GFX7_SFID_PIXEL_INTERPOLATOR: + if (devinfo->ver >= 7) + return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0, + 0, 90 /* XXX */, 0, 0, 0, 0); + else + abort(); + + case GFX12_SFID_UGM: + case GFX12_SFID_TGM: + case GFX12_SFID_SLM: + switch (lsc_msg_desc_opcode(devinfo, info.desc)) { + case LSC_OP_LOAD: + case LSC_OP_STORE: + case LSC_OP_LOAD_CMASK: + case LSC_OP_STORE_CMASK: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 0, 20 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 0); + + case LSC_OP_FENCE: + case LSC_OP_ATOMIC_INC: + case LSC_OP_ATOMIC_DEC: + case LSC_OP_ATOMIC_LOAD: + case LSC_OP_ATOMIC_STORE: + case LSC_OP_ATOMIC_ADD: + case LSC_OP_ATOMIC_SUB: + case LSC_OP_ATOMIC_MIN: + case LSC_OP_ATOMIC_MAX: + case LSC_OP_ATOMIC_UMIN: + case LSC_OP_ATOMIC_UMAX: + case LSC_OP_ATOMIC_CMPXCHG: + case LSC_OP_ATOMIC_FADD: + case LSC_OP_ATOMIC_FSUB: + case LSC_OP_ATOMIC_FMIN: + case LSC_OP_ATOMIC_FMAX: + case LSC_OP_ATOMIC_FCMPXCHG: + case LSC_OP_ATOMIC_AND: + case LSC_OP_ATOMIC_OR: + case LSC_OP_ATOMIC_XOR: + return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, + 30 /* XXX */, 400 /* XXX */, + 10 /* XXX */, 100 /* XXX */, 0, 0, + 0, 400 /* XXX */); + default: + abort(); + } + + case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: + case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: + return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0, + 10 /* XXX */, 0, 0, 0, 0, 0); + + case BRW_SFID_URB: + return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */, + 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0); + + default: + abort(); + } + + case SHADER_OPCODE_UNDEF: + case SHADER_OPCODE_HALT_TARGET: + case FS_OPCODE_SCHEDULING_FENCE: + return 
calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0); + + default: + abort(); + } + } + + /** + * Model the performance behavior of a stall on the specified dependency + * ID. + */ + void + stall_on_dependency(state &st, enum intel_eu_dependency_id id) + { + if (id < ARRAY_SIZE(st.dep_ready)) + st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE], + st.dep_ready[id]); + } + + /** + * Model the performance behavior of the front-end and back-end while + * executing an instruction with the specified timing information, assuming + * all dependencies are already clear. + */ + void + execute_instruction(state &st, const perf_desc &perf) + { + /* Compute the time at which the front-end will be ready to execute the + * next instruction. + */ + st.unit_ready[EU_UNIT_FE] += perf.df; + + if (perf.u < EU_NUM_UNITS) { + /* Wait for the back-end to be ready to execute this instruction. */ + st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE], + st.unit_ready[perf.u]); + + /* Compute the time at which the back-end will be ready to execute + * the next instruction, and update the back-end utilization. + */ + st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db; + st.unit_busy[perf.u] += perf.db * st.weight; + } + } + + /** + * Model the performance behavior of a read dependency provided by an + * instruction. + */ + void + mark_read_dependency(state &st, const perf_desc &perf, + enum intel_eu_dependency_id id) + { + if (id < ARRAY_SIZE(st.dep_ready)) + st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls; + } + + /** + * Model the performance behavior of a write dependency provided by an + * instruction. 
+ */ + void + mark_write_dependency(state &st, const perf_desc &perf, + enum intel_eu_dependency_id id) + { + if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0) + st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la; + else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0) + st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf; + else if (id < ARRAY_SIZE(st.dep_ready)) + st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld; + } + + /** + * Return the dependency ID of a backend_reg, offset by \p delta GRFs. + */ + enum intel_eu_dependency_id + reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r, + const int delta) + { + if (r.file == VGRF) { + const unsigned i = r.nr + r.offset / REG_SIZE + delta; + assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i); + + } else if (r.file == FIXED_GRF) { + const unsigned i = r.nr + delta; + assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i); + + } else if (r.file == MRF && devinfo->ver >= 7) { + const unsigned i = GFX7_MRF_HACK_START + + r.nr + r.offset / REG_SIZE + delta; + assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i); + + } else if (r.file == MRF && devinfo->ver < 7) { + const unsigned i = (r.nr & ~BRW_MRF_COMPR4) + + r.offset / REG_SIZE + delta; + assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i); + + } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS && + r.nr < BRW_ARF_ACCUMULATOR) { + assert(delta == 0); + return EU_DEPENDENCY_ID_ADDR0; + + } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR && + r.nr < BRW_ARF_FLAG) { + const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta; + assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 
+ i); + + } else { + return EU_NUM_DEPENDENCY_IDS; + } + } + + /** + * Return the dependency ID of flag register starting at offset \p i. + */ + enum intel_eu_dependency_id + flag_dependency_id(unsigned i) + { + assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i); + } + + /** + * Return the dependency ID corresponding to the SBID read completion + * condition of a Gfx12+ SWSB. + */ + enum intel_eu_dependency_id + tgl_swsb_rd_dependency_id(tgl_swsb swsb) + { + if (swsb.mode) { + assert(swsb.sbid < + EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid); + } else { + return EU_NUM_DEPENDENCY_IDS; + } + } + + /** + * Return the dependency ID corresponding to the SBID write completion + * condition of a Gfx12+ SWSB. + */ + enum intel_eu_dependency_id + tgl_swsb_wr_dependency_id(tgl_swsb swsb) + { + if (swsb.mode) { + assert(swsb.sbid < + EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0); + return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid); + } else { + return EU_NUM_DEPENDENCY_IDS; + } + } + + /** + * Return the implicit accumulator register accessed by channel \p i of the + * instruction. + */ + unsigned + accum_reg_of_channel(const intel_device_info *devinfo, + const backend_instruction *inst, + brw_reg_type tx, unsigned i) + { + assert(inst->reads_accumulator_implicitly() || + inst->writes_accumulator_implicitly(devinfo)); + const unsigned offset = (inst->group + i) * type_sz(tx) * + (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2); + return offset / (reg_unit(devinfo) * REG_SIZE) % 2; + } + + /** + * Model the performance behavior of an FS back-end instruction. 
+ */ + void + issue_fs_inst(state &st, const struct brw_isa_info *isa, + const backend_instruction *be_inst) + { + const struct intel_device_info *devinfo = isa->devinfo; + const fs_inst *inst = static_cast(be_inst); + const instruction_info info(isa, inst); + const perf_desc perf = instruction_desc(info); + + /* Stall on any source dependencies. */ + for (unsigned i = 0; i < inst->sources; i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, inst->src[i], j)); + } + + if (inst->reads_accumulator_implicitly()) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + stall_on_dependency( + st, reg_dependency_id( + devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); + } + + if (const unsigned mask = inst->flags_read(devinfo)) { + for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { + if (mask & (1 << i)) + stall_on_dependency(st, flag_dependency_id(i)); + } + } + + /* Stall on any write dependencies. */ + if (!inst->no_dd_check) { + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, inst->dst, j)); + } + + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (const unsigned mask = inst->flags_written(devinfo)) { + for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { + if (mask & (1 << i)) + stall_on_dependency(st, flag_dependency_id(i)); + } + } + } + + /* Stall on any SBID dependencies. 
*/ + if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST)) + stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched)); + else if (inst->sched.mode & TGL_SBID_SRC) + stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched)); + + /* Execute the instruction. */ + execute_instruction(st, perf); + + /* Mark any source dependencies. */ + if (inst->is_send_from_grf()) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->is_payload(i)) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + mark_read_dependency( + st, perf, reg_dependency_id(devinfo, inst->src[i], j)); + } + } + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + mark_read_dependency(st, perf, + reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); + } + + /* Mark any destination dependencies. */ + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) { + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, inst->dst, j)); + } + } + + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (const unsigned mask = inst->flags_written(devinfo)) { + for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { + if (mask & (1 << i)) + mark_write_dependency(st, perf, flag_dependency_id(i)); + } + } + + /* Mark any SBID dependencies. */ + if (inst->sched.mode & TGL_SBID_SET) { + mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched)); + mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched)); + } + } + + /** + * Model the performance behavior of a VEC4 back-end instruction. 
+ */ + void + issue_vec4_instruction(state &st, const struct brw_isa_info *isa, + const backend_instruction *be_inst) + { + const struct intel_device_info *devinfo = isa->devinfo; + const vec4_instruction *inst = + static_cast(be_inst); + const instruction_info info(isa, inst); + const perf_desc perf = instruction_desc(info); + + /* Stall on any source dependencies. */ + for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, inst->src[i], j)); + } + + if (inst->reads_accumulator_implicitly()) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + stall_on_dependency( + st, reg_dependency_id( + devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); + } + + if (inst->reads_flag()) + stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0); + + /* Stall on any write dependencies. */ + if (!inst->no_dd_check) { + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, inst->dst, j)); + } + + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (inst->writes_flag(devinfo)) + stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0); + } + + /* Execute the instruction. */ + execute_instruction(st, perf); + + /* Mark any source dependencies. 
*/ + if (inst->is_send_from_grf()) { + for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + mark_read_dependency( + st, perf, reg_dependency_id(devinfo, inst->src[i], j)); + } + } + + if (inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + mark_read_dependency(st, perf, + reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); + } + + /* Mark any destination dependencies. */ + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) { + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, inst->dst, j)); + } + } + + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + mark_write_dependency(st, perf, + reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } + + if (inst->writes_flag(devinfo)) + mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0); + } + + /** + * Calculate the maximum possible throughput of the program compatible with + * the cycle-count utilization estimated for each asynchronous unit, in + * threads-per-cycle units. + */ + float + calculate_thread_throughput(const state &st, float busy) + { + for (unsigned i = 0; i < EU_NUM_UNITS; i++) + busy = MAX2(busy, st.unit_busy[i]); + + return 1.0 / busy; + } + + /** + * Estimate the performance of the specified shader. + */ + void + calculate_performance(performance &p, const backend_shader *s, + void (*issue_instruction)( + state &, const struct brw_isa_info *, + const backend_instruction *), + unsigned dispatch_width) + { + /* XXX - Note that the previous version of this code used worst-case + * scenario estimation of branching divergence for SIMD32 shaders, + * but this heuristic was removed to improve performance in common + * scenarios. Wider shader variants are less optimal when divergence + * is high, e.g. 
when application renders complex scene on a small + * surface. It is assumed that such renders are short, so their + * time doesn't matter and when it comes to the overall performance, + * they are dominated by more optimal larger renders. + * + * It's possible that we could do better with divergence analysis + * by isolating branches which are 100% uniform. + * + * Plumbing the trip counts from NIR loop analysis would allow us + * to do a better job regarding the loop weights. + * + * In the meantime use values that roughly match the control flow + * weights used elsewhere in the compiler back-end. + * + * Note that we provide slightly more pessimistic weights on + * Gfx12+ for SIMD32, since the effective warp size on that + * platform is 2x the SIMD width due to EU fusion, which increases + * the likelihood of divergent control flow in comparison to + * previous generations, giving narrower SIMD modes a performance + * advantage in several test-cases with non-uniform discard jumps. + */ + const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ? 
+ 1.0 : 0.5); + const float loop_weight = 10; + unsigned halt_count = 0; + unsigned elapsed = 0; + state st; + + foreach_block(block, s->cfg) { + const unsigned elapsed0 = elapsed; + + foreach_inst_in_block(backend_instruction, inst, block) { + const unsigned clock0 = st.unit_ready[EU_UNIT_FE]; + + issue_instruction(st, &s->compiler->isa, inst); + + if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count) + st.weight /= discard_weight; + + elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight; + + if (inst->opcode == BRW_OPCODE_DO) + st.weight *= loop_weight; + else if (inst->opcode == BRW_OPCODE_WHILE) + st.weight /= loop_weight; + else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++) + st.weight *= discard_weight; + } + + p.block_latency[block->num] = elapsed - elapsed0; + } + + p.latency = elapsed; + p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed); + } +} + +brw::performance::performance(const fs_visitor *v) : + block_latency(new unsigned[v->cfg->num_blocks]) +{ + calculate_performance(*this, v, issue_fs_inst, v->dispatch_width); +} + +brw::performance::performance(const vec4_visitor *v) : + block_latency(new unsigned[v->cfg->num_blocks]) +{ + calculate_performance(*this, v, issue_vec4_instruction, 8); +} + +brw::performance::~performance() +{ + delete[] block_latency; +} diff --git a/src/intel/compiler/elk/brw_ir_performance.h b/src/intel/compiler/elk/brw_ir_performance.h new file mode 100644 index 00000000000..c3cefe838aa --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_performance.h @@ -0,0 +1,86 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to 
permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_PERFORMANCE_H +#define BRW_IR_PERFORMANCE_H + +class fs_visitor; + +namespace brw { + class vec4_visitor; + + /** + * Various estimates of the performance of a shader based on static + * analysis. + */ + struct performance { + performance(const fs_visitor *v); + performance(const vec4_visitor *v); + ~performance(); + + analysis_dependency_class + dependency_class() const + { + return (DEPENDENCY_INSTRUCTIONS | + DEPENDENCY_BLOCKS); + } + + bool + validate(const backend_shader *) const + { + return true; + } + + /** + * Array containing estimates of the runtime of each basic block of the + * program in cycle units. + */ + unsigned *block_latency; + + /** + * Estimate of the runtime of the whole program in cycle units assuming + * uncontended execution. + */ + unsigned latency; + + /** + * Estimate of the throughput of the whole program in + * invocations-per-cycle units. + * + * Note that this might be lower than the ratio between the dispatch + * width of the program and its latency estimate in cases where + * performance doesn't scale without limits as a function of its thread + * parallelism, e.g. due to the existence of a bottleneck in a shared + * function. 
+ */ + float throughput; + + private: + performance(const performance &perf); + performance & + operator=(performance u); + }; +} + +#endif diff --git a/src/intel/compiler/elk/brw_ir_vec4.h b/src/intel/compiler/elk/brw_ir_vec4.h new file mode 100644 index 00000000000..78d34729c0b --- /dev/null +++ b/src/intel/compiler/elk/brw_ir_vec4.h @@ -0,0 +1,475 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2011-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef BRW_IR_VEC4_H +#define BRW_IR_VEC4_H + +#include "brw_shader.h" + +namespace brw { + +class dst_reg; + +class src_reg : public backend_reg +{ +public: + DECLARE_RALLOC_CXX_OPERATORS(src_reg) + + void init(); + + src_reg(enum brw_reg_file file, int nr, const glsl_type *type); + src_reg(); + src_reg(struct ::brw_reg reg); + + bool equals(const src_reg &r) const; + bool negative_equals(const src_reg &r) const; + + src_reg(class vec4_visitor *v, const struct glsl_type *type); + src_reg(class vec4_visitor *v, const struct glsl_type *type, int size); + + explicit src_reg(const dst_reg ®); + + src_reg *reladdr; +}; + +static inline src_reg +retype(src_reg reg, enum brw_reg_type type) +{ + reg.type = type; + return reg; +} + +namespace detail { + +static inline void +add_byte_offset(backend_reg *reg, unsigned bytes) +{ + switch (reg->file) { + case BAD_FILE: + break; + case VGRF: + case ATTR: + case UNIFORM: + reg->offset += bytes; + assert(reg->offset % 16 == 0); + break; + case MRF: { + const unsigned suboffset = reg->offset + bytes; + reg->nr += suboffset / REG_SIZE; + reg->offset = suboffset % REG_SIZE; + assert(reg->offset % 16 == 0); + break; + } + case ARF: + case FIXED_GRF: { + const unsigned suboffset = reg->subnr + bytes; + reg->nr += suboffset / REG_SIZE; + reg->subnr = suboffset % REG_SIZE; + assert(reg->subnr % 16 == 0); + break; + } + default: + assert(bytes == 0); + } +} + +} /* namespace detail */ + +static inline src_reg +byte_offset(src_reg reg, unsigned bytes) +{ + detail::add_byte_offset(®, bytes); + return reg; +} + +static inline src_reg +offset(src_reg reg, unsigned width, unsigned delta) +{ + const unsigned stride = (reg.file == UNIFORM ? 
0 : 4); + const unsigned num_components = MAX2(width / 4 * stride, 4); + return byte_offset(reg, num_components * type_sz(reg.type) * delta); +} + +static inline src_reg +horiz_offset(src_reg reg, unsigned delta) +{ + return byte_offset(reg, delta * type_sz(reg.type)); +} + +/** + * Reswizzle a given source register. + * \sa brw_swizzle(). + */ +static inline src_reg +swizzle(src_reg reg, unsigned swizzle) +{ + if (reg.file == IMM) + reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle); + else + reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); + + return reg; +} + +static inline src_reg +negate(src_reg reg) +{ + assert(reg.file != IMM); + reg.negate = !reg.negate; + return reg; +} + +static inline bool +is_uniform(const src_reg ®) +{ + return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) && + (!reg.reladdr || is_uniform(*reg.reladdr)); +} + +class dst_reg : public backend_reg +{ +public: + DECLARE_RALLOC_CXX_OPERATORS(dst_reg) + + void init(); + + dst_reg(); + dst_reg(enum brw_reg_file file, int nr); + dst_reg(enum brw_reg_file file, int nr, const glsl_type *type, + unsigned writemask); + dst_reg(enum brw_reg_file file, int nr, brw_reg_type type, + unsigned writemask); + dst_reg(struct ::brw_reg reg); + dst_reg(class vec4_visitor *v, const struct glsl_type *type); + + explicit dst_reg(const src_reg ®); + + bool equals(const dst_reg &r) const; + + src_reg *reladdr; +}; + +static inline dst_reg +retype(dst_reg reg, enum brw_reg_type type) +{ + reg.type = type; + return reg; +} + +static inline dst_reg +byte_offset(dst_reg reg, unsigned bytes) +{ + detail::add_byte_offset(®, bytes); + return reg; +} + +static inline dst_reg +offset(dst_reg reg, unsigned width, unsigned delta) +{ + const unsigned stride = (reg.file == UNIFORM ? 
0 : 4); + const unsigned num_components = MAX2(width / 4 * stride, 4); + return byte_offset(reg, num_components * type_sz(reg.type) * delta); +} + +static inline dst_reg +horiz_offset(const dst_reg ®, unsigned delta) +{ + if (is_uniform(src_reg(reg))) + return reg; + else + return byte_offset(reg, delta * type_sz(reg.type)); +} + +static inline dst_reg +writemask(dst_reg reg, unsigned mask) +{ + assert(reg.file != IMM); + assert((reg.writemask & mask) != 0); + reg.writemask &= mask; + return reg; +} + +/** + * Return an integer identifying the discrete address space a register is + * contained in. A register is by definition fully contained in the single + * reg_space it belongs to, so two registers with different reg_space ids are + * guaranteed not to overlap. Most register files are a single reg_space of + * its own, only the VGRF file is composed of multiple discrete address + * spaces, one for each VGRF allocation. + */ +static inline uint32_t +reg_space(const backend_reg &r) +{ + return r.file << 16 | (r.file == VGRF ? r.nr : 0); +} + +/** + * Return the base offset in bytes of a register relative to the start of its + * reg_space(). + */ +static inline unsigned +reg_offset(const backend_reg &r) +{ + return (r.file == VGRF || r.file == IMM ? 0 : r.nr) * + (r.file == UNIFORM ? 16 : REG_SIZE) + r.offset + + (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0); +} + +/** + * Return whether the register region starting at \p r and spanning \p dr + * bytes could potentially overlap the register region starting at \p s and + * spanning \p ds bytes. + */ +static inline bool +regions_overlap(const backend_reg &r, unsigned dr, + const backend_reg &s, unsigned ds) +{ + if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) { + /* COMPR4 regions are translated by the hardware during decompression + * into two separate half-regions 4 MRFs apart from each other. 
+ */ + backend_reg t0 = r; + t0.nr &= ~BRW_MRF_COMPR4; + backend_reg t1 = t0; + t1.offset += 4 * REG_SIZE; + return regions_overlap(t0, dr / 2, s, ds) || + regions_overlap(t1, dr / 2, s, ds); + + } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) { + return regions_overlap(s, ds, r, dr); + + } else { + return reg_space(r) == reg_space(s) && + !(reg_offset(r) + dr <= reg_offset(s) || + reg_offset(s) + ds <= reg_offset(r)); + } +} + +class vec4_instruction : public backend_instruction { +public: + DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction) + + vec4_instruction(enum opcode opcode, + const dst_reg &dst = dst_reg(), + const src_reg &src0 = src_reg(), + const src_reg &src1 = src_reg(), + const src_reg &src2 = src_reg()); + + dst_reg dst; + src_reg src[3]; + + enum brw_urb_write_flags urb_write_flags; + + unsigned sol_binding; /**< gfx6: SOL binding table index */ + bool sol_final_write; /**< gfx6: send commit message */ + unsigned sol_vertex; /**< gfx6: used for setting dst index in SVB header */ + + bool is_send_from_grf() const; + unsigned size_read(unsigned arg) const; + bool can_reswizzle(const struct intel_device_info *devinfo, + int dst_writemask, + int swizzle, int swizzle_mask); + void reswizzle(int dst_writemask, int swizzle); + bool can_do_source_mods(const struct intel_device_info *devinfo); + bool can_do_cmod(); + bool can_do_writemask(const struct intel_device_info *devinfo); + bool can_change_types() const; + bool has_source_and_destination_hazard() const; + unsigned implied_mrf_writes() const; + + bool is_align1_partial_write() + { + return opcode == VEC4_OPCODE_SET_LOW_32BIT || + opcode == VEC4_OPCODE_SET_HIGH_32BIT; + } + + bool reads_flag() const + { + return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2; + } + + bool reads_flag(unsigned c) + { + if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2) + return true; + + switch (predicate) { + case BRW_PREDICATE_NONE: + return false; + case BRW_PREDICATE_ALIGN16_REPLICATE_X: + return c == 0; + case 
BRW_PREDICATE_ALIGN16_REPLICATE_Y: + return c == 1; + case BRW_PREDICATE_ALIGN16_REPLICATE_Z: + return c == 2; + case BRW_PREDICATE_ALIGN16_REPLICATE_W: + return c == 3; + default: + return true; + } + } + + bool writes_flag(const intel_device_info *devinfo) const + { + return (conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) && + opcode != BRW_OPCODE_CSEL && + opcode != BRW_OPCODE_IF && + opcode != BRW_OPCODE_WHILE)); + } + + bool reads_g0_implicitly() const + { + switch (opcode) { + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + case VS_OPCODE_PULL_CONSTANT_LOAD: + case GS_OPCODE_SET_PRIMITIVE_ID: + case GS_OPCODE_GET_INSTANCE_ID: + case SHADER_OPCODE_GFX4_SCRATCH_READ: + case SHADER_OPCODE_GFX4_SCRATCH_WRITE: + return true; + default: + return false; + } + } +}; + +/** + * Make the execution of \p inst dependent on the evaluation of a possibly + * inverted predicate. + */ +inline vec4_instruction * +set_predicate_inv(enum brw_predicate pred, bool inverse, + vec4_instruction *inst) +{ + inst->predicate = pred; + inst->predicate_inverse = inverse; + return inst; +} + +/** + * Make the execution of \p inst dependent on the evaluation of a predicate. + */ +inline vec4_instruction * +set_predicate(enum brw_predicate pred, vec4_instruction *inst) +{ + return set_predicate_inv(pred, false, inst); +} + +/** + * Write the result of evaluating the condition given by \p mod to a flag + * register. + */ +inline vec4_instruction * +set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst) +{ + inst->conditional_mod = mod; + return inst; +} + +/** + * Clamp the result of \p inst to the saturation range of its destination + * datatype. 
+ */ +inline vec4_instruction * +set_saturate(bool saturate, vec4_instruction *inst) +{ + inst->saturate = saturate; + return inst; +} + +/** + * Return the number of dataflow registers written by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->dst) / + * register_size)'. The somewhat arbitrary register size unit is 16B for the + * UNIFORM and IMM files and 32B for all other files. + */ +inline unsigned +regs_written(const vec4_instruction *inst) +{ + assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); + return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written, + REG_SIZE); +} + +/** + * Return the number of dataflow registers read by the instruction (either + * fully or partially) counted from 'floor(reg_offset(inst->src[i]) / + * register_size)'. The somewhat arbitrary register size unit is 16B for the + * UNIFORM and IMM files and 32B for all other files. + */ +inline unsigned +regs_read(const vec4_instruction *inst, unsigned i) +{ + const unsigned reg_size = + inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE; + return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i), + reg_size); +} + +static inline enum brw_reg_type +get_exec_type(const vec4_instruction *inst) +{ + enum brw_reg_type exec_type = BRW_REGISTER_TYPE_B; + + for (int i = 0; i < 3; i++) { + if (inst->src[i].file != BAD_FILE) { + const brw_reg_type t = get_exec_type(brw_reg_type(inst->src[i].type)); + if (type_sz(t) > type_sz(exec_type)) + exec_type = t; + else if (type_sz(t) == type_sz(exec_type) && + brw_reg_type_is_floating_point(t)) + exec_type = t; + } + } + + if (exec_type == BRW_REGISTER_TYPE_B) + exec_type = inst->dst.type; + + /* TODO: We need to handle half-float conversions. 
*/ + assert(exec_type != BRW_REGISTER_TYPE_HF || + inst->dst.type == BRW_REGISTER_TYPE_HF); + assert(exec_type != BRW_REGISTER_TYPE_B); + + return exec_type; +} + +static inline unsigned +get_exec_type_size(const vec4_instruction *inst) +{ + return type_sz(get_exec_type(inst)); +} + +} /* namespace brw */ + +#endif diff --git a/src/intel/compiler/elk/brw_isa_info.h b/src/intel/compiler/elk/brw_isa_info.h new file mode 100644 index 00000000000..ae0ad3e2c2d --- /dev/null +++ b/src/intel/compiler/elk/brw_isa_info.h @@ -0,0 +1,86 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef BRW_ISA_ENCODING_H +#define BRW_ISA_ENCODING_H + +#include "dev/intel_device_info.h" +#include "brw_eu_defines.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct opcode_desc; + +struct brw_isa_info { + const struct intel_device_info *devinfo; + + /* A mapping from enum opcode to the corresponding opcode_desc */ + const struct opcode_desc *ir_to_descs[NUM_BRW_OPCODES]; + + /** A mapping from a HW opcode encoding to the corresponding opcode_desc */ + const struct opcode_desc *hw_to_descs[128]; +}; + +void brw_init_isa_info(struct brw_isa_info *isa, + const struct intel_device_info *devinfo); + +struct opcode_desc { + unsigned ir; + unsigned hw; + const char *name; + int nsrc; + int ndst; + int gfx_vers; +}; + +const struct opcode_desc * +brw_opcode_desc(const struct brw_isa_info *isa, enum opcode opcode); + +const struct opcode_desc * +brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw); + +static inline unsigned +brw_opcode_encode(const struct brw_isa_info *isa, enum opcode opcode) +{ + return brw_opcode_desc(isa, opcode)->hw; +} + +static inline enum opcode +brw_opcode_decode(const struct brw_isa_info *isa, unsigned hw) +{ + const struct opcode_desc *desc = brw_opcode_desc_from_hw(isa, hw); + return desc ? 
(enum opcode)desc->ir : BRW_OPCODE_ILLEGAL; +} + +static inline bool +is_3src(const struct brw_isa_info *isa, enum opcode opcode) +{ + const struct opcode_desc *desc = brw_opcode_desc(isa, opcode); + return desc && desc->nsrc == 3; +} + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/intel/compiler/elk/brw_kernel.c b/src/intel/compiler/elk/brw_kernel.c new file mode 100644 index 00000000000..a85dc583a58 --- /dev/null +++ b/src/intel/compiler/elk/brw_kernel.c @@ -0,0 +1,790 @@ +/* + * Copyright © 2020 Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_kernel.h" +#include "brw_nir.h" +#include "intel_nir.h" + +#include "intel_nir.h" +#include "nir_clc_helpers.h" +#include "compiler/nir/nir_builder.h" +#include "compiler/spirv/nir_spirv.h" +#include "dev/intel_debug.h" +#include "util/u_atomic.h" +#include "util/u_dynarray.h" + +static const nir_shader * +load_clc_shader(struct brw_compiler *compiler, struct disk_cache *disk_cache, + const nir_shader_compiler_options *nir_options, + const struct spirv_to_nir_options *spirv_options) +{ + if (compiler->clc_shader) + return compiler->clc_shader; + + nir_shader *nir = nir_load_libclc_shader(64, disk_cache, + spirv_options, nir_options, + disk_cache != NULL); + if (nir == NULL) + return NULL; + + const nir_shader *old_nir = + p_atomic_cmpxchg(&compiler->clc_shader, NULL, nir); + if (old_nir == NULL) { + /* We won the race */ + ralloc_steal(compiler, nir); + return nir; + } else { + /* Someone else built the shader first */ + ralloc_free(nir); + return old_nir; + } +} + +static nir_builder +builder_init_new_impl(nir_function *func) +{ + nir_function_impl *impl = nir_function_impl_create(func); + return nir_builder_at(nir_before_impl(impl)); +} + +static void +implement_atomic_builtin(nir_function *func, nir_atomic_op atomic_op, + enum glsl_base_type data_base_type, + nir_variable_mode mode) +{ + nir_builder b = builder_init_new_impl(func); + const struct glsl_type *data_type = glsl_scalar_type(data_base_type); + + unsigned p = 0; + + nir_deref_instr *ret = NULL; + ret = nir_build_deref_cast(&b, nir_load_param(&b, p++), + nir_var_function_temp, data_type, 0); + + nir_intrinsic_op op = nir_intrinsic_deref_atomic; + nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b.shader, op); + nir_intrinsic_set_atomic_op(atomic, atomic_op); + + for (unsigned i = 0; i < nir_intrinsic_infos[op].num_srcs; i++) { + nir_def *src = nir_load_param(&b, p++); + if (i == 0) { + /* The first source is our deref */ + assert(nir_intrinsic_infos[op].src_components[i] == 
-1); + src = &nir_build_deref_cast(&b, src, mode, data_type, 0)->def; + } + atomic->src[i] = nir_src_for_ssa(src); + } + + nir_def_init_for_type(&atomic->instr, &atomic->def, data_type); + + nir_builder_instr_insert(&b, &atomic->instr); + nir_store_deref(&b, ret, &atomic->def, ~0); +} + +static void +implement_sub_group_ballot_builtin(nir_function *func) +{ + nir_builder b = builder_init_new_impl(func); + nir_deref_instr *ret = + nir_build_deref_cast(&b, nir_load_param(&b, 0), + nir_var_function_temp, glsl_uint_type(), 0); + nir_def *cond = nir_load_param(&b, 1); + + nir_intrinsic_instr *ballot = + nir_intrinsic_instr_create(b.shader, nir_intrinsic_ballot); + ballot->src[0] = nir_src_for_ssa(cond); + ballot->num_components = 1; + nir_def_init(&ballot->instr, &ballot->def, 1, 32); + nir_builder_instr_insert(&b, &ballot->instr); + + nir_store_deref(&b, ret, &ballot->def, ~0); +} + +static bool +implement_intel_builtins(nir_shader *nir) +{ + bool progress = false; + + nir_foreach_function(func, nir) { + if (strcmp(func->name, "_Z10atomic_minPU3AS1Vff") == 0) { + /* float atom_min(__global float volatile *p, float val) */ + implement_atomic_builtin(func, nir_atomic_op_fmin, + GLSL_TYPE_FLOAT, nir_var_mem_global); + progress = true; + } else if (strcmp(func->name, "_Z10atomic_maxPU3AS1Vff") == 0) { + /* float atom_max(__global float volatile *p, float val) */ + implement_atomic_builtin(func, nir_atomic_op_fmax, + GLSL_TYPE_FLOAT, nir_var_mem_global); + progress = true; + } else if (strcmp(func->name, "_Z10atomic_minPU3AS3Vff") == 0) { + /* float atomic_min(__shared float volatile *, float) */ + implement_atomic_builtin(func, nir_atomic_op_fmin, + GLSL_TYPE_FLOAT, nir_var_mem_shared); + progress = true; + } else if (strcmp(func->name, "_Z10atomic_maxPU3AS3Vff") == 0) { + /* float atomic_max(__shared float volatile *, float) */ + implement_atomic_builtin(func, nir_atomic_op_fmax, + GLSL_TYPE_FLOAT, nir_var_mem_shared); + progress = true; + } else if (strcmp(func->name, 
"intel_sub_group_ballot") == 0) { + implement_sub_group_ballot_builtin(func); + progress = true; + } + } + + nir_shader_preserve_all_metadata(nir); + + return progress; +} + +static bool +lower_kernel_intrinsics(nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + bool progress = false; + + unsigned kernel_sysvals_start = 0; + unsigned kernel_arg_start = sizeof(struct brw_kernel_sysvals); + nir->num_uniforms += kernel_arg_start; + + nir_builder b = nir_builder_create(impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_kernel_input: { + b.cursor = nir_instr_remove(&intrin->instr); + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform); + load->num_components = intrin->num_components; + load->src[0] = nir_src_for_ssa(nir_u2u32(&b, intrin->src[0].ssa)); + nir_intrinsic_set_base(load, kernel_arg_start); + nir_intrinsic_set_range(load, nir->num_uniforms); + nir_def_init(&load->instr, &load->def, + intrin->def.num_components, + intrin->def.bit_size); + nir_builder_instr_insert(&b, &load->instr); + + nir_def_rewrite_uses(&intrin->def, &load->def); + progress = true; + break; + } + + case nir_intrinsic_load_constant_base_ptr: { + b.cursor = nir_instr_remove(&intrin->instr); + nir_def *const_data_base_addr = nir_pack_64_2x32_split(&b, + nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW), + nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH)); + nir_def_rewrite_uses(&intrin->def, const_data_base_addr); + progress = true; + break; + } + + case nir_intrinsic_load_num_workgroups: { + b.cursor = nir_instr_remove(&intrin->instr); + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform); + load->num_components = 3; + load->src[0] 
= nir_src_for_ssa(nir_imm_int(&b, 0)); + nir_intrinsic_set_base(load, kernel_sysvals_start + + offsetof(struct brw_kernel_sysvals, num_work_groups)); + nir_intrinsic_set_range(load, 3 * 4); + nir_def_init(&load->instr, &load->def, 3, 32); + nir_builder_instr_insert(&b, &load->instr); + nir_def_rewrite_uses(&intrin->def, &load->def); + progress = true; + break; + } + + default: + break; + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); + } + + return progress; +} + +bool +brw_kernel_from_spirv(struct brw_compiler *compiler, + struct disk_cache *disk_cache, + struct brw_kernel *kernel, + void *log_data, void *mem_ctx, + const uint32_t *spirv, size_t spirv_size, + const char *entrypoint_name, + char **error_str) +{ + const struct intel_device_info *devinfo = compiler->devinfo; + const nir_shader_compiler_options *nir_options = + compiler->nir_options[MESA_SHADER_KERNEL]; + + struct spirv_to_nir_options spirv_options = { + .environment = NIR_SPIRV_OPENCL, + .caps = { + .address = true, + .float16 = devinfo->ver >= 8, + .float64 = devinfo->ver >= 8, + .groups = true, + .image_write_without_format = true, + .int8 = devinfo->ver >= 8, + .int16 = devinfo->ver >= 8, + .int64 = devinfo->ver >= 8, + .int64_atomics = devinfo->ver >= 9, + .kernel = true, + .linkage = true, /* We receive linked kernel from clc */ + .float_controls = devinfo->ver >= 8, + .generic_pointers = true, + .storage_8bit = devinfo->ver >= 8, + .storage_16bit = devinfo->ver >= 8, + .subgroup_arithmetic = true, + .subgroup_basic = true, + .subgroup_ballot = true, + .subgroup_dispatch = true, + .subgroup_quad = true, + .subgroup_shuffle = true, + .subgroup_vote = true, + + .intel_subgroup_shuffle = true, + .intel_subgroup_buffer_block_io = true, + }, + .shared_addr_format = nir_address_format_62bit_generic, + .global_addr_format = nir_address_format_62bit_generic, + .temp_addr_format 
= nir_address_format_62bit_generic, + .constant_addr_format = nir_address_format_64bit_global, + }; + + spirv_options.clc_shader = load_clc_shader(compiler, disk_cache, + nir_options, &spirv_options); + if (spirv_options.clc_shader == NULL) { + fprintf(stderr, "ERROR: libclc shader missing." + " Consider installing the libclc package\n"); + abort(); + } + + assert(spirv_size % 4 == 0); + nir_shader *nir = + spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL, + entrypoint_name, &spirv_options, nir_options); + nir_validate_shader(nir, "after spirv_to_nir"); + nir_validate_ssa_dominance(nir, "after spirv_to_nir"); + ralloc_steal(mem_ctx, nir); + nir->info.name = ralloc_strdup(nir, entrypoint_name); + + if (INTEL_DEBUG(DEBUG_CS)) { + /* Re-index SSA defs so we print more sensible numbers. */ + nir_foreach_function_impl(impl, nir) { + nir_index_ssa_defs(impl); + } + + fprintf(stderr, "NIR (from SPIR-V) for kernel\n"); + nir_print_shader(nir, stderr); + } + + NIR_PASS_V(nir, implement_intel_builtins); + NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader); + + /* We have to lower away local constant initializers right before we + * inline functions. That way they get properly initialized at the top + * of the function and not at the top of its caller. + */ + NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_copy_prop); + NIR_PASS_V(nir, nir_opt_deref); + + /* Pick off the single entrypoint that we want */ + nir_remove_non_entrypoints(nir); + + /* Now that we've deleted all but the main function, we can go ahead and + * lower the rest of the constant initializers. We do this here so that + * nir_remove_dead_variables and split_per_member_structs below see the + * corresponding stores. 
+ */ + NIR_PASS_V(nir, nir_lower_variable_initializers, ~0); + + /* LLVM loves take advantage of the fact that vec3s in OpenCL are 16B + * aligned and so it can just read/write them as vec4s. This results in a + * LOT of vec4->vec3 casts on loads and stores. One solution to this + * problem is to get rid of all vec3 variables. + */ + NIR_PASS_V(nir, nir_lower_vec3_to_vec4, + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global| + nir_var_mem_constant); + + /* We assign explicit types early so that the optimizer can take advantage + * of that information and hopefully get rid of some of our memcpys. + */ + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_uniform | + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global, + glsl_get_cl_type_size_align); + + struct brw_nir_compiler_opts opts = {}; + brw_preprocess_nir(compiler, nir, &opts); + + int max_arg_idx = -1; + nir_foreach_uniform_variable(var, nir) { + assert(var->data.location < 256); + max_arg_idx = MAX2(max_arg_idx, var->data.location); + } + + kernel->args_size = nir->num_uniforms; + kernel->arg_count = max_arg_idx + 1; + + /* No bindings */ + struct brw_kernel_arg_desc *args = + rzalloc_array(mem_ctx, struct brw_kernel_arg_desc, kernel->arg_count); + kernel->args = args; + + nir_foreach_uniform_variable(var, nir) { + struct brw_kernel_arg_desc arg_desc = { + .offset = var->data.driver_location, + .size = glsl_get_explicit_size(var->type, false), + }; + assert(arg_desc.offset + arg_desc.size <= nir->num_uniforms); + + assert(var->data.location >= 0); + args[var->data.location] = arg_desc; + } + + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL); + + /* Lower again, this time after dead-variables to get more compact variable + * layouts. 
+ */ + nir->global_mem_size = 0; + nir->scratch_size = 0; + nir->info.shared_size = 0; + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant, + glsl_get_cl_type_size_align); + if (nir->constant_data_size > 0) { + assert(nir->constant_data == NULL); + nir->constant_data = rzalloc_size(nir, nir->constant_data_size); + nir_gather_explicit_io_initializers(nir, nir->constant_data, + nir->constant_data_size, + nir_var_mem_constant); + } + + if (INTEL_DEBUG(DEBUG_CS)) { + /* Re-index SSA defs so we print more sensible numbers. */ + nir_foreach_function_impl(impl, nir) { + nir_index_ssa_defs(impl); + } + + fprintf(stderr, "NIR (before I/O lowering) for kernel\n"); + nir_print_shader(nir, stderr); + } + + NIR_PASS_V(nir, nir_lower_memcpy); + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant, + nir_address_format_64bit_global); + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform, + nir_address_format_32bit_offset_as_64bit); + + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global, + nir_address_format_62bit_generic); + + NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL); + + NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL); + NIR_PASS_V(nir, lower_kernel_intrinsics); + + struct brw_cs_prog_key key = { }; + + memset(&kernel->prog_data, 0, sizeof(kernel->prog_data)); + kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4); + + struct brw_compile_cs_params params = { + .base = { + .nir = nir, + .stats = kernel->stats, + .log_data = log_data, + .mem_ctx = mem_ctx, + }, + .key = &key, + .prog_data = &kernel->prog_data, + }; + + kernel->code = brw_compile_cs(compiler, ¶ms); + + if (error_str) + *error_str = params.base.error_str; + + return kernel->code != NULL; +} + +static nir_def * +rebuild_value_from_store(struct util_dynarray *stores, + nir_def 
*value, unsigned read_offset) +{ + unsigned read_size = value->num_components * value->bit_size / 8; + + util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) { + nir_intrinsic_instr *store = *_store; + + unsigned write_offset = nir_src_as_uint(store->src[1]); + unsigned write_size = nir_src_num_components(store->src[0]) * + nir_src_bit_size(store->src[0]) / 8; + if (write_offset <= read_offset && + (write_offset + write_size) >= (read_offset + read_size)) { + assert(nir_block_dominates(store->instr.block, value->parent_instr->block)); + assert(write_size == read_size); + return store->src[0].ssa; + } + } + unreachable("Matching scratch store not found"); +} + +/** + * Remove temporary variables stored to scratch to be then reloaded + * immediately. Remap the load to the store SSA value. + * + * This workaround is only meant to be applied to shaders in src/intel/shaders + * were we know there should be no issue. More complex cases might not work + * with this approach. + */ +static bool +nir_remove_llvm17_scratch(nir_shader *nir) +{ + struct util_dynarray scratch_stores; + void *mem_ctx = ralloc_context(NULL); + + util_dynarray_init(&scratch_stores, mem_ctx); + + nir_foreach_function_impl(func, nir) { + nir_foreach_block(block, func) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic != nir_intrinsic_store_scratch) + continue; + + nir_const_value *offset = nir_src_as_const_value(intrin->src[1]); + if (offset != NULL) { + util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin); + } + } + } + } + + bool progress = false; + if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) { + nir_foreach_function_impl(func, nir) { + nir_foreach_block(block, func) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = 
nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic != nir_intrinsic_load_scratch) + continue; + + nir_const_value *offset = nir_src_as_const_value(intrin->src[0]); + if (offset == NULL) + continue; + + nir_def_rewrite_uses(&intrin->def, + rebuild_value_from_store( + &scratch_stores, &intrin->def, + nir_src_as_uint(intrin->src[0]))); + nir_instr_remove(instr); + + progress = true; + } + } + } + } + + util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr *, _store) { + nir_intrinsic_instr *store = *_store; + nir_instr_remove(&store->instr); + } + + /* Quick sanity check */ + assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 || + progress); + + ralloc_free(mem_ctx); + + return progress; +} + +static void +cleanup_llvm17_scratch(nir_shader *nir) +{ + { + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_algebraic); + } while (progress); + } + + nir_remove_llvm17_scratch(nir); + + { + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_algebraic); + } while (progress); + } +} + +nir_shader * +brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size, + bool llvm17_wa) +{ + struct spirv_to_nir_options spirv_options = { + .environment = NIR_SPIRV_OPENCL, + .caps = { + .address = true, + .groups = true, + .image_write_without_format = true, + .int8 = true, + .int16 = true, + .int64 = true, + .int64_atomics = true, + .kernel = true, + .linkage = true, /* We receive linked kernel from clc */ + .float_controls = true, + .generic_pointers = true, + .storage_8bit = true, + .storage_16bit = true, + .subgroup_arithmetic = true, + 
.subgroup_basic = true, + .subgroup_ballot = true, + .subgroup_dispatch = true, + .subgroup_quad = true, + .subgroup_shuffle = true, + .subgroup_vote = true, + + .intel_subgroup_shuffle = true, + .intel_subgroup_buffer_block_io = true, + }, + .shared_addr_format = nir_address_format_62bit_generic, + .global_addr_format = nir_address_format_62bit_generic, + .temp_addr_format = nir_address_format_62bit_generic, + .constant_addr_format = nir_address_format_64bit_global, + .create_library = true, + }; + + assert(spirv_size % 4 == 0); + nir_shader *nir = + spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL, + "library", &spirv_options, &brw_scalar_nir_options); + nir_validate_shader(nir, "after spirv_to_nir"); + nir_validate_ssa_dominance(nir, "after spirv_to_nir"); + ralloc_steal(mem_ctx, nir); + nir->info.name = ralloc_strdup(nir, "library"); + + if (INTEL_DEBUG(DEBUG_CS)) { + /* Re-index SSA defs so we print more sensible numbers. */ + nir_foreach_function_impl(impl, nir) { + nir_index_ssa_defs(impl); + } + + fprintf(stderr, "NIR (from SPIR-V) for kernel\n"); + nir_print_shader(nir, stderr); + } + + NIR_PASS_V(nir, implement_intel_builtins); + NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader); + + /* We have to lower away local constant initializers right before we + * inline functions. That way they get properly initialized at the top + * of the function and not at the top of its caller. 
+ */ + NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp | + nir_var_function_temp)); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | + nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL); + { + bool progress; + do + { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_algebraic); + } while (progress); + } + + NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_inline_functions); + + assert(nir->scratch_size == 0); + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align); + + { + bool progress; + do + { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_split_var_copies); + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform); + NIR_PASS(progress, nir, nir_opt_memcpy); + } while (progress); + } + + 
NIR_PASS_V(nir, nir_scale_fdiv); + + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | + nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL); + + + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL); + + nir->scratch_size = 0; + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | + nir_var_mem_global | nir_var_mem_constant, + glsl_get_cl_type_size_align); + + // Lower memcpy - needs to wait until types are sized + { + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_opt_memcpy); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_split_var_copies); + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + } while (progress); + } + NIR_PASS_V(nir, nir_lower_memcpy); + + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform, + nir_address_format_32bit_offset_as_64bit); + + NIR_PASS_V(nir, nir_lower_system_values); + + /* Hopefully we can drop this once lower_vars_to_ssa has improved to not + * lower everything to scratch. + */ + if (llvm17_wa) + cleanup_llvm17_scratch(nir); + + /* Lower again, this time after dead-variables to get more compact variable + * layouts. 
+ */ + nir->global_mem_size = 0; + nir->scratch_size = 0; + nir->info.shared_size = 0; + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant, + glsl_get_cl_type_size_align); + if (nir->constant_data_size > 0) { + assert(nir->constant_data == NULL); + nir->constant_data = rzalloc_size(nir, nir->constant_data_size); + nir_gather_explicit_io_initializers(nir, nir->constant_data, + nir->constant_data_size, + nir_var_mem_constant); + } + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant, + nir_address_format_64bit_global); + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform, + nir_address_format_32bit_offset_as_64bit); + + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global, + nir_address_format_62bit_generic); + + if (INTEL_DEBUG(DEBUG_CS)) { + /* Re-index SSA defs so we print more sensible numbers. */ + nir_foreach_function_impl(impl, nir) { + nir_index_ssa_defs(impl); + } + + fprintf(stderr, "NIR (before I/O lowering) for kernel\n"); + nir_print_shader(nir, stderr); + } + + return nir; +} diff --git a/src/intel/compiler/elk/brw_kernel.h b/src/intel/compiler/elk/brw_kernel.h new file mode 100644 index 00000000000..fb1289872d5 --- /dev/null +++ b/src/intel/compiler/elk/brw_kernel.h @@ -0,0 +1,78 @@ +/* + * Copyright © 2020 Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies 
or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_KERNEL_H +#define BRW_KERNEL_H + +#include "brw_compiler.h" + +struct disk_cache; + +#ifdef __cplusplus +extern "C" { +#endif + +/** Software interface for system values in kernels + * + * These are intended to go at the start of the kernel argument buffer. + */ +struct brw_kernel_sysvals { + uint32_t num_work_groups[3]; + uint32_t pad[5]; +}; + +struct brw_kernel_arg_desc { + uint16_t offset; + uint16_t size; +}; + +struct brw_kernel { + struct brw_cs_prog_data prog_data; + + struct brw_compile_stats stats[3]; + + uint16_t args_size; + uint16_t arg_count; + const struct brw_kernel_arg_desc *args; + + const void *code; +}; + +bool +brw_kernel_from_spirv(struct brw_compiler *compiler, + struct disk_cache *disk_cache, + struct brw_kernel *kernel, + void *log_data, void *mem_ctx, + const uint32_t *spirv, size_t spirv_size, + const char *entrypoint_name, + char **error_str); + +nir_shader * +brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size, + bool llvm17_wa); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* BRW_KERNEL_H */ diff --git a/src/intel/compiler/elk/brw_lex.l b/src/intel/compiler/elk/brw_lex.l new file mode 100644 index 00000000000..d230d997358 --- /dev/null +++ b/src/intel/compiler/elk/brw_lex.l @@ -0,0 +1,465 @@ +%option yylineno +%option nounput +%{ +#include +#include "brw_asm.h" +#undef ALIGN16 +#include "brw_gram.tab.h" + +/* Locations */ +int yycolumn = 1; + +int 
saved_state = 0; +extern char *input_filename; + +#define YY_NO_INPUT +#define YY_USER_ACTION \ + yylloc.first_line = yylloc.last_line = yylineno; \ + yylloc.first_column = yycolumn; \ + yylloc.last_column = yycolumn + yyleng - 1; \ + yycolumn += yyleng; +%} + +%x BLOCK_COMMENT +%x FILENAME +%x CHANNEL +%x REG +%x DOTSEL +%x LABEL +%x MSGDESC +%% + + /* eat up single line comment */ +\/\/.*[\r\n] { yycolumn = 1; } + + /* eat up multiline comment */ +\/\* { saved_state = YYSTATE; BEGIN(BLOCK_COMMENT); } + +\*\/ { BEGIN(saved_state); } + +. { } +[\r\n] { } + +\"[^\"]+\" { + char *name = malloc(yyleng - 1); + memmove(name, yytext + 1, yyleng - 2); + name[yyleng-1] = '\0'; + input_filename = name; + } + + /* null register */ +null { BEGIN(REG); return NULL_TOKEN; } + + /* Opcodes */ +add { yylval.integer = BRW_OPCODE_ADD; return ADD; } +add3 { yylval.integer = BRW_OPCODE_ADD3; return ADD3; } +addc { yylval.integer = BRW_OPCODE_ADDC; return ADDC; } +and { yylval.integer = BRW_OPCODE_AND; return AND; } +asr { yylval.integer = BRW_OPCODE_ASR; return ASR; } +avg { yylval.integer = BRW_OPCODE_AVG; return AVG; } +bfe { yylval.integer = BRW_OPCODE_BFE; return BFE; } +bfi1 { yylval.integer = BRW_OPCODE_BFI1; return BFI1; } +bfi2 { yylval.integer = BRW_OPCODE_BFI2; return BFI2; } +bfrev { yylval.integer = BRW_OPCODE_BFREV; return BFREV; } +brc { yylval.integer = BRW_OPCODE_BRC; return BRC; } +brd { yylval.integer = BRW_OPCODE_BRD; return BRD; } +break { yylval.integer = BRW_OPCODE_BREAK; return BREAK; } +call { yylval.integer = BRW_OPCODE_CALL; return CALL; } +calla { yylval.integer = BRW_OPCODE_CALLA; return CALLA; } +case { yylval.integer = BRW_OPCODE_CASE; return CASE; } +cbit { yylval.integer = BRW_OPCODE_CBIT; return CBIT; } +cmp { yylval.integer = BRW_OPCODE_CMP; return CMP; } +cmpn { yylval.integer = BRW_OPCODE_CMPN; return CMPN; } +cont { yylval.integer = BRW_OPCODE_CONTINUE; return CONT; } +csel { yylval.integer = BRW_OPCODE_CSEL; return CSEL; } +dim { yylval.integer = 
BRW_OPCODE_DIM; return DIM; } +do { yylval.integer = BRW_OPCODE_DO; return DO; } +dp2 { yylval.integer = BRW_OPCODE_DP2; return DP2; } +dp3 { yylval.integer = BRW_OPCODE_DP3; return DP3; } +dp4 { yylval.integer = BRW_OPCODE_DP4; return DP4; } +dp4a { yylval.integer = BRW_OPCODE_DP4A; return DP4A; } +dph { yylval.integer = BRW_OPCODE_DPH; return DPH; } +else { yylval.integer = BRW_OPCODE_ELSE; return ELSE; } +endif { yylval.integer = BRW_OPCODE_ENDIF; return ENDIF; } +f16to32 { yylval.integer = BRW_OPCODE_F16TO32; return F16TO32; } +f32to16 { yylval.integer = BRW_OPCODE_F32TO16; return F32TO16; } +fbh { yylval.integer = BRW_OPCODE_FBH; return FBH; } +fbl { yylval.integer = BRW_OPCODE_FBL; return FBL; } +fork { yylval.integer = BRW_OPCODE_FORK; return FORK; } +frc { yylval.integer = BRW_OPCODE_FRC; return FRC; } +goto { yylval.integer = BRW_OPCODE_GOTO; return GOTO; } +halt { yylval.integer = BRW_OPCODE_HALT; return HALT; } +if { yylval.integer = BRW_OPCODE_IF; return IF; } +iff { yylval.integer = BRW_OPCODE_IFF; return IFF; } +illegal { yylval.integer = BRW_OPCODE_ILLEGAL; return ILLEGAL; } +jmpi { yylval.integer = BRW_OPCODE_JMPI; return JMPI; } +line { yylval.integer = BRW_OPCODE_LINE; return LINE; } +lrp { yylval.integer = BRW_OPCODE_LRP; return LRP; } +lzd { yylval.integer = BRW_OPCODE_LZD; return LZD; } +mac { yylval.integer = BRW_OPCODE_MAC; return MAC; } +mach { yylval.integer = BRW_OPCODE_MACH; return MACH; } +mad { yylval.integer = BRW_OPCODE_MAD; return MAD; } +madm { yylval.integer = BRW_OPCODE_MADM; return MADM; } +mov { yylval.integer = BRW_OPCODE_MOV; return MOV; } +movi { yylval.integer = BRW_OPCODE_MOVI; return MOVI; } +mul { yylval.integer = BRW_OPCODE_MUL; return MUL; } +mrest { yylval.integer = BRW_OPCODE_MREST; return MREST; } +msave { yylval.integer = BRW_OPCODE_MSAVE; return MSAVE; } +nenop { yylval.integer = BRW_OPCODE_NENOP; return NENOP; } +nop { yylval.integer = BRW_OPCODE_NOP; return NOP; } +not { yylval.integer = BRW_OPCODE_NOT; return 
NOT; } +or { yylval.integer = BRW_OPCODE_OR; return OR; } +pln { yylval.integer = BRW_OPCODE_PLN; return PLN; } +pop { yylval.integer = BRW_OPCODE_POP; return POP; } +push { yylval.integer = BRW_OPCODE_PUSH; return PUSH; } +ret { yylval.integer = BRW_OPCODE_RET; return RET; } +rndd { yylval.integer = BRW_OPCODE_RNDD; return RNDD; } +rnde { yylval.integer = BRW_OPCODE_RNDE; return RNDE; } +rndu { yylval.integer = BRW_OPCODE_RNDU; return RNDU; } +rndz { yylval.integer = BRW_OPCODE_RNDZ; return RNDZ; } +rol { yylval.integer = BRW_OPCODE_ROL; return ROL; } +ror { yylval.integer = BRW_OPCODE_ROR; return ROR; } +sad2 { yylval.integer = BRW_OPCODE_SAD2; return SAD2; } +sada2 { yylval.integer = BRW_OPCODE_SADA2; return SADA2; } +sel { yylval.integer = BRW_OPCODE_SEL; return SEL; } +send { + yylval.integer = BRW_OPCODE_SEND; + return p->devinfo->ver < 12 ? SEND_GFX4 : SEND_GFX12; + } +sendc { + yylval.integer = BRW_OPCODE_SENDC; + return p->devinfo->ver < 12 ? SENDC_GFX4 : SENDC_GFX12; + } +sends { yylval.integer = BRW_OPCODE_SENDS; return SENDS; } +sendsc { yylval.integer = BRW_OPCODE_SENDSC; return SENDSC; } +shl { yylval.integer = BRW_OPCODE_SHL; return SHL; } +shr { yylval.integer = BRW_OPCODE_SHR; return SHR; } +smov { yylval.integer = BRW_OPCODE_SMOV; return SMOV; } +subb { yylval.integer = BRW_OPCODE_SUBB; return SUBB; } +wait { yylval.integer = BRW_OPCODE_WAIT; return WAIT; } +while { yylval.integer = BRW_OPCODE_WHILE; return WHILE; } +xor { yylval.integer = BRW_OPCODE_XOR; return XOR; } +sync { yylval.integer = BRW_OPCODE_SYNC; return SYNC; } + + /* extended math functions */ +cos { yylval.integer = BRW_MATH_FUNCTION_COS; return COS; } +exp { yylval.integer = BRW_MATH_FUNCTION_EXP; return EXP; } +fdiv { yylval.integer = BRW_MATH_FUNCTION_FDIV; return FDIV; } +inv { yylval.integer = BRW_MATH_FUNCTION_INV; return INV; } +invm { yylval.integer = GFX8_MATH_FUNCTION_INVM; return INVM; } +intdiv { + yylval.integer = BRW_MATH_FUNCTION_INT_DIV_QUOTIENT; + return INTDIV; + 
} +intdivmod { + yylval.integer = + BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER; + return INTDIVMOD; + } +intmod { + yylval.integer = BRW_MATH_FUNCTION_INT_DIV_REMAINDER; + return INTMOD; + } +log { yylval.integer = BRW_MATH_FUNCTION_LOG; return LOG; } +pow { yylval.integer = BRW_MATH_FUNCTION_POW; return POW; } +rsq { yylval.integer = BRW_MATH_FUNCTION_RSQ; return RSQ; } +rsqrtm { yylval.integer = GFX8_MATH_FUNCTION_RSQRTM; return RSQRTM; } +sin { yylval.integer = BRW_MATH_FUNCTION_SIN; return SIN; } +sqrt { yylval.integer = BRW_MATH_FUNCTION_SQRT; return SQRT; } +sincos { yylval.integer = BRW_MATH_FUNCTION_SINCOS; return SINCOS; } + + /* sync instruction */ +allrd { yylval.integer = TGL_SYNC_ALLRD; return ALLRD; } +allwr { yylval.integer = TGL_SYNC_ALLWR; return ALLWR; } +fence { yylval.integer = TGL_SYNC_FENCE; return FENCE; } +bar { yylval.integer = TGL_SYNC_BAR; return BAR; } +host { yylval.integer = TGL_SYNC_HOST; return HOST; } + + /* shared functions for send instruction */ +sampler { return SAMPLER; } +dp_sampler { return DP_SAMPLER; } +gateway { return GATEWAY; } +urb { return URB; } +thread_spawner { return THREAD_SPAWNER; } +render { return RENDER; } +const { return CONST; } +data { return DATA; } +cre { return CRE; } +math { return MATH; } +read { return READ; } +write { return WRITE; } +vme { return VME; } +"pixel interp" { return PIXEL_INTERP; } +"dp data 1" { return DP_DATA_1; } +"rt accel" { return RT_ACCEL; } +slm { return SLM; } +tgm { return TGM; } +ugm { return UGM; } + +";" { return SEMICOLON; } +":" { return COLON; } +"(" { return LPAREN; } +")" { return RPAREN; } +"{" { return LCURLY; } +"}" { return RCURLY; } +"[" { return LSQUARE; } +"]" { return RSQUARE; } +"<" { return LANGLE; } +">" { return RANGLE; } +"," { return COMMA; } +"." 
{ return DOT; } +"+" { return PLUS; } +"-" { return MINUS; } +"~" { return MINUS; } +"(abs)" { return ABS; } + + +"VxH" { return VxH; } +"<" { return LANGLE; } +[0-9][0-9]* { + yylval.integer = strtoul(yytext, NULL, 10); + return INTEGER; + } +">" { return RANGLE; } +"," { return COMMA; } +"." { BEGIN(DOTSEL); return DOT; } +";" { return SEMICOLON; } + +"x" { yylval.integer = BRW_CHANNEL_X; return X; } +"y" { yylval.integer = BRW_CHANNEL_Y; return Y; } +"z" { yylval.integer = BRW_CHANNEL_Z; return Z; } +"w" { yylval.integer = BRW_CHANNEL_W; return W; } +[0-9][0-9]* { + yylval.integer = strtoul(yytext, NULL, 10); + BEGIN(REG); + return INTEGER; + } +. { yyless(0); BEGIN(INITIAL); } +. { yyless(0); BEGIN(INITIAL); } + + /* Access mode */ +"align1" { return ALIGN1; } +"align16" { return ALIGN16; } + + /* Accumulator write control */ +AccWrEnable { return ACCWREN; } + + /* Mask control (formerly WECtrl/Write Enable Control) */ +"WE_all" { return WECTRL; } + + /* Compaction control */ +compacted { return CMPTCTRL; } + + /* Debug control */ +breakpoint { return BREAKPOINT; } + + /* Dependency control */ +NoDDClr { return NODDCLR; } +NoDDChk { return NODDCHK; } + + /* End of thread */ +EOT { return EOT; } + + /* Mask control */ +nomask { return MASK_DISABLE; } + + /* Channel */ +"x" { yylval.integer = BRW_CHANNEL_X; return X; } +"y" { yylval.integer = BRW_CHANNEL_Y; return Y; } +"z" { yylval.integer = BRW_CHANNEL_Z; return Z; } +"w" { yylval.integer = BRW_CHANNEL_W; return W; } +[0-9][0-9]* { + yylval.integer = strtoul(yytext, NULL, 10); + return INTEGER; + } +"." { return DOT; } +. 
{ yyless(0); BEGIN(INITIAL); } + + + /* Predicate Control */ +".anyv" { yylval.integer = BRW_PREDICATE_ALIGN1_ANYV; return ANYV; } +".allv" { yylval.integer = BRW_PREDICATE_ALIGN1_ALLV; return ALLV; } +".any2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY2H; return ANY2H; } +".all2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL2H; return ALL2H; } +".any4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ANY4H; return ANY4H; } +".all4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ALL4H; return ALL4H; } +".any8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY8H; return ANY8H; } +".all8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL8H; return ALL8H; } +".any16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY16H; return ANY16H; } +".all16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL16H; return ALL16H; } +".any32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY32H; return ANY32H; } +".all32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL32H; return ALL32H; } + + /* Saturation */ +".sat" { return SATURATE; } + + /* Thread control */ +atomic { return ATOMIC; } +switch { return SWITCH; } + + /* compression control */ +compr { return COMPR; } +compr4 { return COMPR4; } +sechalf { return SECHALF; } + + /* Quarter Control */ +1[HNQ] { } +"2Q" { return QTR_2Q; } +"3Q" { return QTR_3Q; } +"4Q" { return QTR_4Q; } +"2H" { return QTR_2H; } +"2N" { return QTR_2N; } +"3N" { return QTR_3N; } +"4N" { return QTR_4N; } +"5N" { return QTR_5N; } +"6N" { return QTR_6N; } +"7N" { return QTR_7N; } +"8N" { return QTR_8N; } + + /* data types */ +:?B { return TYPE_B; } +:?D { return TYPE_D; } +:?DF { return TYPE_DF; } +:?F { return TYPE_F; } +:?HF { return TYPE_HF; } +:?NF { return TYPE_NF; } +:?Q { return TYPE_Q; } +:?UB { return TYPE_UB; } +:?UD { return TYPE_UD; } +:?UW { return TYPE_UW; } +:?UQ { return TYPE_UQ; } +:?UV { return TYPE_UV; } +:?V { return TYPE_V; } +:?VF { return TYPE_VF; } +:?W { return TYPE_W; } + + /* Address registers */ +"a0" { return ADDRREG; } + + /* accumulator registers */ +"acc"[0-9]+ { 
yylval.integer = atoi(yytext + 3); return ACCREG; } + + /* channel enable registers */ +"ce0" { return CHANNELENABLEREG; } + + /* control registers */ +"cr0" { return CONTROLREG; } + + /* flag registers f0/f1; [01] not [0|1] — the latter also matched a + * literal '|', letting the bogus token "f|" lex as flag register 0. + */ +"f"[01] { BEGIN(CHANNEL); yylval.integer = atoi(yytext + 1); return FLAGREG; } + + /* message control registers */ +"m" { return MSGREGFILE; } +m[0-9]+ { yylval.integer = atoi(yytext + 1); BEGIN(REG); return MSGREG; } + + /* state register */ +sr[0-9]+ { yylval.integer = atoi(yytext + 2); return STATEREG; } + + /* notification registers */ +"n0" { BEGIN(REG); return NOTIFYREG; } + + /* IP register */ +"ip" { return IPREG; } + + /* Thread control register */ +"tdr0" { return THREADREG; } + + /* performance register */ +"tm0" { BEGIN(REG); return PERFORMANCEREG; } + +[gr][0-9]+ { + yylval.integer = atoi(yytext + 1); + BEGIN(REG); return GENREG; + } +[gr] { return GENREGFILE; } +"mask"[0-9]+ { yylval.integer = atoi(yytext + 4); return MASKREG; } + + /* Conditional modifiers */ +".e" { yylval.integer = BRW_CONDITIONAL_Z; return EQUAL; } +".g" { yylval.integer = BRW_CONDITIONAL_G; return GREATER; } +".ge" { yylval.integer = BRW_CONDITIONAL_GE; return GREATER_EQUAL; } +".l" { yylval.integer = BRW_CONDITIONAL_L; return LESS; } +".le" { yylval.integer = BRW_CONDITIONAL_LE; return LESS_EQUAL; } +".ne" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_EQUAL; } +".nz" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_ZERO; } +".o" { yylval.integer = BRW_CONDITIONAL_O; return OVERFLOW; } +".r" { yylval.integer = BRW_CONDITIONAL_R; return ROUND_INCREMENT; } +".u" { yylval.integer = BRW_CONDITIONAL_U; return UNORDERED; } +".z" { yylval.integer = BRW_CONDITIONAL_Z; return ZERO; } + + /* Eat up JIP and UIP token, their values will be parsed + * in numeric section + */ +"JIP: " { BEGIN(LABEL); } +"UIP: " { BEGIN(LABEL); } +"Jump: " { } +"Pop: " { } +[ \t]+ { } + +"MsgDesc: " { BEGIN(MSGDESC); return MSGDESC_BEGIN; } +ex_bso { return EX_BSO; } +src1_len { return SRC1_LEN; } +"="
{ return ASSIGN; } +[0-9][0-9]* { + yylval.integer = strtoul(yytext, NULL, 10); + return INTEGER; + } +"{" { yyless(0); BEGIN(INITIAL); return MSGDESC_END; } +. { } + +"0x"[0-9a-f][0-9a-f]* { + yylval.llint = strtoull(yytext + 2, NULL, 16); + return LONG; + } +[0-9][0-9]* { + yylval.llint = strtoll(yytext, NULL, 10); + return LONG; + } + + /* jump label target */ +[a-zA-Z_][0-9a-zA-Z_]*":" { + yylval.string = ralloc_strdup(p->mem_ctx, yytext); + /* Stomp the trailing ':' */ + yylval.string[yyleng - 1] = '\0'; + return JUMP_LABEL_TARGET; +} + + /* jump label */ +