intel/elk: Fork Gfx8- compiler by copying existing code

Based on code from commit c3ceec6cd8.

Acked-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27563>
Author: Caio Oliveira, 2024-01-19 11:32:57 -08:00 (committed by Marge Bot)
parent a9214460ee
commit d44462c08d
777 changed files with 151345 additions and 0 deletions


@@ -0,0 +1,122 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#ifndef BRW_ASM_H
#define BRW_ASM_H
#include <inttypes.h>
#include <stdbool.h>
#include <assert.h>
#include "compiler/brw_reg.h"
#include "compiler/brw_reg_type.h"
#include "compiler/brw_eu_defines.h"
#include "compiler/brw_inst.h"
#include "compiler/brw_eu.h"
#include "dev/intel_device_info.h"
#include "util/list.h"
/* glibc < 2.27 defines OVERFLOW in /usr/include/math.h. */
#undef OVERFLOW
int yyparse(void);
int yylex(void);
char *lex_text(void);
extern struct brw_codegen *p;
extern int errors;
extern char *input_filename;
extern struct list_head instr_labels;
extern struct list_head target_labels;
struct condition {
unsigned cond_modifier:4;
unsigned flag_reg_nr:1;
unsigned flag_subreg_nr:1;
};
struct predicate {
unsigned pred_control:4;
unsigned pred_inv:1;
unsigned flag_reg_nr:1;
unsigned flag_subreg_nr:1;
};
enum instoption_type {
INSTOPTION_FLAG,
INSTOPTION_DEP_INFO,
};
struct instoption {
enum instoption_type type;
union {
unsigned uint_value;
struct tgl_swsb depinfo_value;
};
};
struct options {
unsigned access_mode:1;
unsigned compression_control:2;
unsigned thread_control:2;
unsigned no_dd_check:1; /* Dependency control */
unsigned no_dd_clear:1; /* Dependency control */
unsigned mask_control:1;
unsigned debug_control:1;
unsigned acc_wr_control:1;
unsigned end_of_thread:1;
unsigned compaction:1;
unsigned qtr_ctrl:2;
unsigned nib_ctrl:1;
unsigned is_compr:1;
struct tgl_swsb depinfo;
};
struct msgdesc {
unsigned ex_bso:1;
unsigned src1_len:5;
};
enum instr_label_type {
INSTR_LABEL_JIP,
INSTR_LABEL_UIP,
};
struct instr_label {
struct list_head link;
char *name;
int offset;
enum instr_label_type type;
};
struct target_label {
struct list_head link;
char *name;
int offset;
};
#endif /* BRW_ASM_H */


@@ -0,0 +1,385 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <stdio.h>
#include <stdlib.h> /* exit(), free(), EXIT_* */
#include <string.h> /* memcpy(), strcmp(), strdup() */
#include <getopt.h>
#include "brw_asm.h"
#include "intel/compiler/brw_disasm_info.h"
enum opt_output_type {
OPT_OUTPUT_HEX,
OPT_OUTPUT_C_LITERAL,
OPT_OUTPUT_BIN,
};
extern FILE *yyin;
struct brw_codegen *p;
static enum opt_output_type output_type = OPT_OUTPUT_BIN;
char *input_filename = NULL;
int errors;
struct list_head instr_labels;
struct list_head target_labels;
static void
print_help(const char *progname, FILE *file)
{
fprintf(file,
"Usage: %s [OPTION] inputfile\n"
"Assemble i965 instructions from input file.\n\n"
" -h, --help display this help and exit\n"
" -t, --type=OUTPUT_TYPE OUTPUT_TYPE can be 'bin' (default if omitted),\n"
" 'c_literal', or 'hex'\n"
" -o, --output specify output file\n"
" --compact print compacted instructions\n"
" -g, --gen=platform assemble instructions for given \n"
" platform (3 letter platform name)\n"
"Example:\n"
" i965_asm -g kbl input.asm -t hex -o output\n",
progname);
}
static uint32_t
get_dword(const brw_inst *inst, int idx)
{
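/* memcpy is the portable way to extract a dword from the raw encoded
* instruction without type-punning the underlying bytes. */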
uint32_t dword;
memcpy(&dword, (char *)inst + 4 * idx, sizeof(dword));
return dword;
}
static void
print_instruction(FILE *output, bool compact, const brw_inst *instruction)
{
const unsigned byte_limit = compact ? 8 : 16;
switch (output_type) {
case OPT_OUTPUT_HEX: {
fprintf(output, "%02x", ((unsigned char *)instruction)[0]);
for (unsigned i = 1; i < byte_limit; i++) {
fprintf(output, " %02x", ((unsigned char *)instruction)[i]);
}
break;
}
case OPT_OUTPUT_C_LITERAL: {
fprintf(output, "\t0x%08x,", get_dword(instruction, 0));
for (unsigned i = 1; i < byte_limit / 4; i++)
fprintf(output, " 0x%08x,", get_dword(instruction, i));
break;
}
case OPT_OUTPUT_BIN:
fwrite(instruction, 1, byte_limit, output);
break;
}
if (output_type != OPT_OUTPUT_BIN) {
fprintf(output, "\n");
}
}
static struct intel_device_info *
i965_disasm_init(uint16_t pci_id)
{
struct intel_device_info *devinfo;
devinfo = malloc(sizeof *devinfo);
if (devinfo == NULL)
return NULL;
if (!intel_get_device_info_from_pci_id(pci_id, devinfo)) {
fprintf(stderr, "can't find device information: pci_id=0x%x\n",
pci_id);
free(devinfo);
return NULL;
}
return devinfo;
}
static bool
i965_postprocess_labels(void)
{
if (p->devinfo->ver < 6) {
return true;
}
void *store = p->store;
struct target_label *tlabel;
struct instr_label *ilabel, *s;
const unsigned to_bytes_scale = brw_jump_scale(p->devinfo);
LIST_FOR_EACH_ENTRY(tlabel, &target_labels, link) {
LIST_FOR_EACH_ENTRY_SAFE(ilabel, s, &instr_labels, link) {
if (!strcmp(tlabel->name, ilabel->name)) {
brw_inst *inst = store + ilabel->offset;
int relative_offset = (tlabel->offset - ilabel->offset) / sizeof(brw_inst);
relative_offset *= to_bytes_scale;
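/* Note: brw_jump_scale() converts instruction counts into the units the
* jump fields expect: 1 (whole instructions) on Gfx4, 2 (64-bit chunks)
* on Gfx5-7, and 16 (bytes) on Gfx8+. */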
unsigned opcode = brw_inst_opcode(p->isa, inst);
if (ilabel->type == INSTR_LABEL_JIP) {
switch (opcode) {
case BRW_OPCODE_IF:
case BRW_OPCODE_ELSE:
case BRW_OPCODE_ENDIF:
case BRW_OPCODE_WHILE:
if (p->devinfo->ver >= 7) {
brw_inst_set_jip(p->devinfo, inst, relative_offset);
} else if (p->devinfo->ver == 6) {
brw_inst_set_gfx6_jump_count(p->devinfo, inst, relative_offset);
}
break;
case BRW_OPCODE_BREAK:
case BRW_OPCODE_HALT:
case BRW_OPCODE_CONTINUE:
brw_inst_set_jip(p->devinfo, inst, relative_offset);
break;
default:
fprintf(stderr, "Unknown opcode %d with JIP label\n", opcode);
return false;
}
} else {
switch (opcode) {
case BRW_OPCODE_IF:
case BRW_OPCODE_ELSE:
if (p->devinfo->ver >= 7) {
brw_inst_set_uip(p->devinfo, inst, relative_offset);
}
/* Gfx6 IF/ELSE have no UIP field; nothing to set. */
break;
case BRW_OPCODE_WHILE:
case BRW_OPCODE_ENDIF:
fprintf(stderr, "WHILE/ENDIF cannot have UIP offset\n");
return false;
case BRW_OPCODE_BREAK:
case BRW_OPCODE_CONTINUE:
case BRW_OPCODE_HALT:
brw_inst_set_uip(p->devinfo, inst, relative_offset);
break;
default:
fprintf(stderr, "Unknown opcode %d with UIP label\n", opcode);
return false;
}
}
list_del(&ilabel->link);
}
}
}
LIST_FOR_EACH_ENTRY(ilabel, &instr_labels, link) {
fprintf(stderr, "Unknown label '%s'\n", ilabel->name);
}
return list_is_empty(&instr_labels);
}
int main(int argc, char **argv)
{
char *output_file = NULL;
int c; /* getopt_long() returns an int; char breaks the -1 check where char is unsigned */
FILE *output = stdout;
int help = false, compact = false; /* int, not bool: getopt_long() stores an int through the flag pointers */
void *store;
uint64_t pci_id = 0;
int offset = 0, err;
int start_offset = 0;
struct disasm_info *disasm_info;
struct intel_device_info *devinfo = NULL;
int result = EXIT_FAILURE;
list_inithead(&instr_labels);
list_inithead(&target_labels);
const struct option i965_asm_opts[] = {
{ "help", no_argument, (int *) &help, true },
{ "type", required_argument, NULL, 't' },
{ "gen", required_argument, NULL, 'g' },
{ "output", required_argument, NULL, 'o' },
{ "compact", no_argument, (int *) &compact, true },
{ NULL, 0, NULL, 0 }
};
while ((c = getopt_long(argc, argv, ":t:g:o:h", i965_asm_opts, NULL)) != -1) {
switch (c) {
case 'g': {
const int id = intel_device_name_to_pci_device_id(optarg);
if (id < 0) {
fprintf(stderr, "can't parse gen: '%s', expected 3 letter "
"platform name\n", optarg);
goto end;
} else {
pci_id = id;
}
break;
}
case 'h':
help = true;
print_help(argv[0], stderr);
goto end;
case 't': {
if (strcmp(optarg, "hex") == 0) {
output_type = OPT_OUTPUT_HEX;
} else if (strcmp(optarg, "c_literal") == 0) {
output_type = OPT_OUTPUT_C_LITERAL;
} else if (strcmp(optarg, "bin") == 0) {
output_type = OPT_OUTPUT_BIN;
} else {
fprintf(stderr, "invalid value for --type: %s\n", optarg);
goto end;
}
break;
}
case 'o':
output_file = strdup(optarg);
break;
case 0:
break;
case ':':
fprintf(stderr, "%s: option `-%c' requires an argument\n",
argv[0], optopt);
goto end;
case '?':
default:
fprintf(stderr, "%s: option `-%c' is invalid: ignored\n",
argv[0], optopt);
goto end;
}
}
if (help || !pci_id) {
print_help(argv[0], stderr);
goto end;
}
if (!argv[optind]) {
fprintf(stderr, "Please specify input file\n");
goto end;
}
input_filename = strdup(argv[optind]);
yyin = fopen(input_filename, "r");
if (!yyin) {
fprintf(stderr, "Unable to read input file : %s\n",
input_filename);
goto end;
}
if (output_file) {
output = fopen(output_file, "w");
if (!output) {
fprintf(stderr, "Couldn't open output file\n");
goto end;
}
}
devinfo = i965_disasm_init(pci_id);
if (!devinfo) {
fprintf(stderr, "Unable to allocate memory for "
"intel_device_info struct instance.\n");
goto end;
}
struct brw_isa_info isa;
brw_init_isa_info(&isa, devinfo);
p = rzalloc(NULL, struct brw_codegen);
brw_init_codegen(&isa, p, p);
p->automatic_exec_sizes = false;
err = yyparse();
if (err || errors)
goto end;
if (!i965_postprocess_labels())
goto end;
store = p->store;
disasm_info = disasm_initialize(p->isa, NULL);
if (!disasm_info) {
fprintf(stderr, "Unable to initialize disasm_info struct instance\n");
goto end;
}
if (output_type == OPT_OUTPUT_C_LITERAL)
fprintf(output, "{\n");
brw_validate_instructions(p->isa, p->store, 0,
p->next_insn_offset, disasm_info);
const int nr_insn = (p->next_insn_offset - start_offset) / 16;
if (compact)
brw_compact_instructions(p, start_offset, disasm_info);
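/* Walk the store instruction by instruction: after compaction a compacted
* instruction occupies 8 bytes and a full-size one 16, so the offset
* advances accordingly. */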
for (int i = 0; i < nr_insn; i++) {
const brw_inst *insn = store + offset;
bool compacted = false;
if (compact && brw_inst_cmpt_control(p->devinfo, insn)) {
offset += 8;
compacted = true;
} else {
offset += 16;
}
print_instruction(output, compacted, insn);
}
ralloc_free(disasm_info);
if (output_type == OPT_OUTPUT_C_LITERAL)
fprintf(output, "}");
result = EXIT_SUCCESS;
end:
free(input_filename);
free(output_file);
if (yyin)
fclose(yyin);
if (output)
fclose(output);
if (p)
ralloc_free(p);
if (devinfo)
free(devinfo);
exit(result);
}


@@ -0,0 +1,833 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#include "brw_cfg.h"
#include "util/u_dynarray.h"
#include "brw_shader.h"
/** @file brw_cfg.cpp
*
* Walks the generated shader instructions and creates a set of basic
* blocks with successor/predecessor edges connecting them.
*/
using namespace brw;
static bblock_t *
pop_stack(exec_list *list)
{
bblock_link *link = (bblock_link *)list->get_tail();
bblock_t *block = link->block;
link->link.remove();
return block;
}
static exec_node *
link(void *mem_ctx, bblock_t *block, enum bblock_link_kind kind)
{
bblock_link *l = new(mem_ctx) bblock_link(block, kind);
return &l->link;
}
static void
push_stack(exec_list *list, void *mem_ctx, bblock_t *block)
{
/* The kind of the link is immaterial, but we need to provide one since
* this is (ab)using the edge data structure in order to implement a stack.
*/
list->push_tail(link(mem_ctx, block, bblock_link_logical));
}
bblock_t::bblock_t(cfg_t *cfg) :
cfg(cfg), start_ip(0), end_ip(0), end_ip_delta(0), num(0)
{
instructions.make_empty();
parents.make_empty();
children.make_empty();
}
void
bblock_t::add_successor(void *mem_ctx, bblock_t *successor,
enum bblock_link_kind kind)
{
successor->parents.push_tail(::link(mem_ctx, this, kind));
children.push_tail(::link(mem_ctx, successor, kind));
}
bool
bblock_t::is_predecessor_of(const bblock_t *block,
enum bblock_link_kind kind) const
{
foreach_list_typed_safe (bblock_link, parent, link, &block->parents) {
if (parent->block == this && parent->kind <= kind) {
return true;
}
}
return false;
}
bool
bblock_t::is_successor_of(const bblock_t *block,
enum bblock_link_kind kind) const
{
foreach_list_typed_safe (bblock_link, child, link, &block->children) {
if (child->block == this && child->kind <= kind) {
return true;
}
}
return false;
}
static bool
ends_block(const backend_instruction *inst)
{
enum opcode op = inst->opcode;
return op == BRW_OPCODE_IF ||
op == BRW_OPCODE_ELSE ||
op == BRW_OPCODE_CONTINUE ||
op == BRW_OPCODE_BREAK ||
op == BRW_OPCODE_DO ||
op == BRW_OPCODE_WHILE;
}
static bool
starts_block(const backend_instruction *inst)
{
enum opcode op = inst->opcode;
return op == BRW_OPCODE_DO ||
op == BRW_OPCODE_ENDIF;
}
bool
bblock_t::can_combine_with(const bblock_t *that) const
{
if ((const bblock_t *)this->link.next != that)
return false;
if (ends_block(this->end()) ||
starts_block(that->start()))
return false;
return true;
}
void
bblock_t::combine_with(bblock_t *that)
{
assert(this->can_combine_with(that));
foreach_list_typed (bblock_link, link, link, &that->parents) {
assert(link->block == this);
}
this->end_ip = that->end_ip;
this->instructions.append_list(&that->instructions);
this->cfg->remove_block(that);
}
void
bblock_t::dump(FILE *file) const
{
const backend_shader *s = this->cfg->s;
int ip = this->start_ip;
foreach_inst_in_block(backend_instruction, inst, this) {
fprintf(file, "%5d: ", ip);
s->dump_instruction(inst, file);
ip++;
}
}
void
bblock_t::unlink_list(exec_list *list)
{
assert(list == &parents || list == &children);
const bool remove_parent = list == &children;
foreach_list_typed_safe(bblock_link, link, link, list) {
/* Also break the links from the other block back to this block. */
exec_list *sub_list = remove_parent ? &link->block->parents : &link->block->children;
foreach_list_typed_safe(bblock_link, sub_link, link, sub_list) {
if (sub_link->block == this) {
sub_link->link.remove();
ralloc_free(sub_link);
}
}
link->link.remove();
ralloc_free(link);
}
}
cfg_t::cfg_t(const backend_shader *s, exec_list *instructions) :
s(s)
{
mem_ctx = ralloc_context(NULL);
block_list.make_empty();
blocks = NULL;
num_blocks = 0;
bblock_t *cur = NULL;
int ip = 0;
bblock_t *entry = new_block();
bblock_t *cur_if = NULL; /**< BB ending with IF. */
bblock_t *cur_else = NULL; /**< BB ending with ELSE. */
bblock_t *cur_do = NULL; /**< BB starting with DO. */
bblock_t *cur_while = NULL; /**< BB immediately following WHILE. */
exec_list if_stack, else_stack, do_stack, while_stack;
bblock_t *next;
set_next_block(&cur, entry, ip);
foreach_in_list_safe(backend_instruction, inst, instructions) {
/* set_next_block wants the post-incremented ip */
ip++;
inst->exec_node::remove();
switch (inst->opcode) {
case BRW_OPCODE_IF:
cur->instructions.push_tail(inst);
/* Push our information onto a stack so we can recover from
* nested ifs.
*/
push_stack(&if_stack, mem_ctx, cur_if);
push_stack(&else_stack, mem_ctx, cur_else);
cur_if = cur;
cur_else = NULL;
/* Set up our immediately following block, full of "then"
* instructions.
*/
next = new_block();
cur_if->add_successor(mem_ctx, next, bblock_link_logical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_ELSE:
cur->instructions.push_tail(inst);
cur_else = cur;
next = new_block();
assert(cur_if != NULL);
cur_if->add_successor(mem_ctx, next, bblock_link_logical);
cur_else->add_successor(mem_ctx, next, bblock_link_physical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_ENDIF: {
bblock_t *cur_endif;
if (cur->instructions.is_empty()) {
/* New block was just created; use it. */
cur_endif = cur;
} else {
cur_endif = new_block();
cur->add_successor(mem_ctx, cur_endif, bblock_link_logical);
set_next_block(&cur, cur_endif, ip - 1);
}
cur->instructions.push_tail(inst);
if (cur_else) {
cur_else->add_successor(mem_ctx, cur_endif, bblock_link_logical);
} else {
assert(cur_if != NULL);
cur_if->add_successor(mem_ctx, cur_endif, bblock_link_logical);
}
assert(cur_if->end()->opcode == BRW_OPCODE_IF);
assert(!cur_else || cur_else->end()->opcode == BRW_OPCODE_ELSE);
/* Pop the stack so we're in the previous if/else/endif */
cur_if = pop_stack(&if_stack);
cur_else = pop_stack(&else_stack);
break;
}
case BRW_OPCODE_DO:
/* Push our information onto a stack so we can recover from
* nested loops.
*/
push_stack(&do_stack, mem_ctx, cur_do);
push_stack(&while_stack, mem_ctx, cur_while);
/* Set up the block just after the while. Don't know when exactly
* it will start, yet.
*/
cur_while = new_block();
if (cur->instructions.is_empty()) {
/* New block was just created; use it. */
cur_do = cur;
} else {
cur_do = new_block();
cur->add_successor(mem_ctx, cur_do, bblock_link_logical);
set_next_block(&cur, cur_do, ip - 1);
}
cur->instructions.push_tail(inst);
/* Represent divergent execution of the loop as a pair of alternative
* edges coming out of the DO instruction: For any physical iteration
* of the loop a given logical thread can either start off enabled
* (which is represented as the "next" successor), or disabled (if it
* has reached a non-uniform exit of the loop during a previous
* iteration, which is represented as the "cur_while" successor).
*
* The disabled edge will be taken by the logical thread anytime we
* arrive at the DO instruction through a back-edge coming from a
* conditional exit of the loop where divergent control flow started.
*
* This guarantees that there is a control-flow path from any
* divergence point of the loop into the convergence point
* (immediately past the WHILE instruction) such that it overlaps the
* whole IP region of divergent control flow (potentially the whole
* loop) *and* doesn't imply the execution of any instructions part
* of the loop (since the corresponding execution mask bit will be
* disabled for a diverging thread).
*
* This way we make sure that any variables that are live throughout
* the region of divergence for an inactive logical thread are also
* considered to interfere with any other variables assigned by
* active logical threads within the same physical region of the
* program, since otherwise we would risk cross-channel data
* corruption.
*/
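/* Illustrative sketch of the edges created just below:
*
*    [block ending in DO] --logical--> [first block of the loop body]
*                     \
*                      `--physical--> [block following the WHILE]
*/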
next = new_block();
cur->add_successor(mem_ctx, next, bblock_link_logical);
cur->add_successor(mem_ctx, cur_while, bblock_link_physical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_CONTINUE:
cur->instructions.push_tail(inst);
/* A conditional CONTINUE may start a region of divergent control
* flow until the start of the next loop iteration (*not* until the
* end of the loop which is why the successor is not the top-level
* divergence point at cur_do). The live interval of any variable
* extending through a CONTINUE edge is guaranteed to overlap the
* whole region of divergent execution, because any variable live-out
* at the CONTINUE instruction will also be live-in at the top of the
* loop, and therefore also live-out at the bottom-most point of the
* loop which is reachable from the top (since a control flow path
* exists from a definition of the variable through this CONTINUE
* instruction, the top of the loop, the (reachable) bottom of the
* loop, the top of the loop again, into a use of the variable).
*/
assert(cur_do != NULL);
cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical);
next = new_block();
if (inst->predicate)
cur->add_successor(mem_ctx, next, bblock_link_logical);
else
cur->add_successor(mem_ctx, next, bblock_link_physical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_BREAK:
cur->instructions.push_tail(inst);
/* A conditional BREAK instruction may start a region of divergent
* control flow until the end of the loop if the condition is
* non-uniform, in which case the loop will execute additional
* iterations with the present channel disabled. We model this as a
* control flow path from the divergence point to the convergence
* point that overlaps the whole IP range of the loop and skips over
* the execution of any other instructions part of the loop.
*
* See the DO case for additional explanation.
*/
assert(cur_do != NULL);
cur->add_successor(mem_ctx, cur_do, bblock_link_physical);
cur->add_successor(mem_ctx, cur_while, bblock_link_logical);
next = new_block();
if (inst->predicate)
cur->add_successor(mem_ctx, next, bblock_link_logical);
else
cur->add_successor(mem_ctx, next, bblock_link_physical);
set_next_block(&cur, next, ip);
break;
case BRW_OPCODE_WHILE:
cur->instructions.push_tail(inst);
assert(cur_do != NULL && cur_while != NULL);
/* A conditional WHILE instruction may start a region of divergent
* control flow until the end of the loop, just like the BREAK
* instruction. See the BREAK case for more details. OTOH an
* unconditional WHILE instruction is non-divergent (just like an
* unconditional CONTINUE), and will necessarily lead to the
* execution of an additional iteration of the loop for all enabled
* channels, so we may skip over the divergence point at the top of
* the loop to keep the CFG as unambiguous as possible.
*/
if (inst->predicate) {
cur->add_successor(mem_ctx, cur_do, bblock_link_logical);
} else {
cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical);
}
set_next_block(&cur, cur_while, ip);
/* Pop the stack so we're in the previous loop */
cur_do = pop_stack(&do_stack);
cur_while = pop_stack(&while_stack);
break;
default:
cur->instructions.push_tail(inst);
break;
}
}
cur->end_ip = ip - 1;
make_block_array();
}
cfg_t::~cfg_t()
{
ralloc_free(mem_ctx);
}
void
cfg_t::remove_block(bblock_t *block)
{
foreach_list_typed_safe (bblock_link, predecessor, link, &block->parents) {
/* cfg_t::validate checks that predecessor and successor lists are well
* formed, so it is known that the loop here would find exactly one
* block. Set old_link_kind to silence "variable used but not set"
* warnings.
*/
bblock_link_kind old_link_kind = bblock_link_logical;
/* Remove block from all of its predecessors' successor lists. */
foreach_list_typed_safe (bblock_link, successor, link,
&predecessor->block->children) {
if (block == successor->block) {
old_link_kind = successor->kind;
successor->link.remove();
ralloc_free(successor);
break;
}
}
/* Add removed-block's successors to its predecessors' successor lists. */
foreach_list_typed (bblock_link, successor, link, &block->children) {
bool need_to_link = true;
bblock_link_kind new_link_kind = MAX2(old_link_kind, successor->kind);
foreach_list_typed_safe (bblock_link, child, link, &predecessor->block->children) {
/* There is already a link between the two blocks. If the links
* are the same kind or the link is logical, do nothing. If the
* existing link is physical and the proposed new link is logical,
* promote the existing link to logical.
*
* This is accomplished by taking the minimum of the existing link
* kind and the proposed link kind.
*/
if (child->block == successor->block) {
child->kind = MIN2(child->kind, new_link_kind);
need_to_link = false;
break;
}
}
if (need_to_link) {
predecessor->block->children.push_tail(link(mem_ctx,
successor->block,
new_link_kind));
}
}
}
foreach_list_typed_safe (bblock_link, successor, link, &block->children) {
/* cfg_t::validate checks that predecessor and successor lists are well
* formed, so it is known that the loop here would find exactly one
* block. Set old_link_kind to silence "variable used but not set"
* warnings.
*/
bblock_link_kind old_link_kind = bblock_link_logical;
/* Remove block from all of its children's parent lists. */
foreach_list_typed_safe (bblock_link, predecessor, link,
&successor->block->parents) {
if (block == predecessor->block) {
old_link_kind = predecessor->kind;
predecessor->link.remove();
ralloc_free(predecessor);
}
}
/* Add removed-block's predecessors to its successors' predecessor lists. */
foreach_list_typed (bblock_link, predecessor, link, &block->parents) {
bool need_to_link = true;
bblock_link_kind new_link_kind = MAX2(old_link_kind, predecessor->kind);
foreach_list_typed_safe (bblock_link, parent, link, &successor->block->parents) {
/* There is already a link between the two blocks. If the links
* are the same kind or the link is logical, do nothing. If the
* existing link is physical and the proposed new link is logical,
* promote the existing link to logical.
*
* This is accomplished by taking the minimum of the existing link
* kind and the proposed link kind.
*/
if (parent->block == predecessor->block) {
parent->kind = MIN2(parent->kind, new_link_kind);
need_to_link = false;
break;
}
}
if (need_to_link) {
successor->block->parents.push_tail(link(mem_ctx,
predecessor->block,
new_link_kind));
}
}
}
block->link.remove();
for (int b = block->num; b < this->num_blocks - 1; b++) {
this->blocks[b] = this->blocks[b + 1];
this->blocks[b]->num = b;
}
this->blocks[this->num_blocks - 1]->num = this->num_blocks - 2;
this->num_blocks--;
}
bblock_t *
cfg_t::new_block()
{
bblock_t *block = new(mem_ctx) bblock_t(this);
return block;
}
void
cfg_t::set_next_block(bblock_t **cur, bblock_t *block, int ip)
{
if (*cur) {
(*cur)->end_ip = ip - 1;
}
block->start_ip = ip;
block->num = num_blocks++;
block_list.push_tail(&block->link);
*cur = block;
}
void
cfg_t::make_block_array()
{
blocks = ralloc_array(mem_ctx, bblock_t *, num_blocks);
int i = 0;
foreach_block (block, this) {
blocks[i++] = block;
}
assert(i == num_blocks);
}
namespace {
struct link_desc {
char kind;
int num;
};
int
compare_link_desc(const void *a, const void *b)
{
const link_desc *la = (const link_desc *)a;
const link_desc *lb = (const link_desc *)b;
return la->num < lb->num ? -1 :
la->num > lb->num ? +1 :
la->kind < lb->kind ? -1 :
la->kind > lb->kind ? +1 :
0;
}
void
sort_links(util_dynarray *scratch, exec_list *list)
{
util_dynarray_clear(scratch);
foreach_list_typed(bblock_link, link, link, list) {
link_desc l;
l.kind = link->kind == bblock_link_logical ? '-' : '~';
l.num = link->block->num;
util_dynarray_append(scratch, link_desc, l);
}
qsort(scratch->data, util_dynarray_num_elements(scratch, link_desc),
sizeof(link_desc), compare_link_desc);
}
} /* namespace */
void
cfg_t::dump(FILE *file)
{
const idom_tree *idom = (s ? &s->idom_analysis.require() : NULL);
/* Temporary storage to sort the lists of blocks. This normalizes the
* output, making it possible to use it for certain tests.
*/
util_dynarray scratch;
util_dynarray_init(&scratch, NULL);
foreach_block (block, this) {
if (idom && idom->parent(block))
fprintf(file, "START B%d IDOM(B%d)", block->num,
idom->parent(block)->num);
else
fprintf(file, "START B%d IDOM(none)", block->num);
sort_links(&scratch, &block->parents);
util_dynarray_foreach(&scratch, link_desc, l)
fprintf(file, " <%cB%d", l->kind, l->num);
fprintf(file, "\n");
if (s != NULL)
block->dump(file);
fprintf(file, "END B%d", block->num);
sort_links(&scratch, &block->children);
util_dynarray_foreach(&scratch, link_desc, l)
fprintf(file, " %c>B%d", l->kind, l->num);
fprintf(file, "\n");
}
util_dynarray_fini(&scratch);
}
/* Calculates the immediate dominator of each block, according to "A Simple,
* Fast Dominance Algorithm" by Keith D. Cooper, Timothy J. Harvey, and Ken
* Kennedy.
*
* The authors claim that for control flow graphs of sizes normally encountered
* (less than 1000 nodes) that this algorithm is significantly faster than
* others like Lengauer-Tarjan.
*/
idom_tree::idom_tree(const backend_shader *s) :
num_parents(s->cfg->num_blocks),
parents(new bblock_t *[num_parents]())
{
bool changed;
parents[0] = s->cfg->blocks[0];
do {
changed = false;
foreach_block(block, s->cfg) {
if (block->num == 0)
continue;
bblock_t *new_idom = NULL;
foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
if (parent(parent_link->block)) {
new_idom = (new_idom ? intersect(new_idom, parent_link->block) :
parent_link->block);
}
}
if (parent(block) != new_idom) {
parents[block->num] = new_idom;
changed = true;
}
}
} while (changed);
}
idom_tree::~idom_tree()
{
delete[] parents;
}
bblock_t *
idom_tree::intersect(bblock_t *b1, bblock_t *b2) const
{
/* Note, the comparisons here are the opposite of what the paper says
* because we index blocks from beginning -> end (i.e. reverse post-order)
* instead of post-order like they assume.
*/
while (b1->num != b2->num) {
while (b1->num > b2->num)
b1 = parent(b1);
while (b2->num > b1->num)
b2 = parent(b2);
}
assert(b1);
return b1;
}
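/* Worked example (hypothetical block numbering): with parent(B7) == B4,
* parent(B4) == B2 and parent(B5) == B2, intersect(B7, B5) walks
* B7 -> B4 -> B2 on one side and B5 -> B2 on the other, returning B2,
* the closest common dominator.
*/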
void
idom_tree::dump() const
{
printf("digraph DominanceTree {\n");
for (unsigned i = 0; i < num_parents; i++)
printf("\t%d -> %d\n", parents[i]->num, i);
printf("}\n");
}
void
cfg_t::dump_cfg()
{
printf("digraph CFG {\n");
for (int b = 0; b < num_blocks; b++) {
bblock_t *block = this->blocks[b];
foreach_list_typed_safe (bblock_link, child, link, &block->children) {
printf("\t%d -> %d\n", b, child->block->num);
}
}
printf("}\n");
}
#define cfgv_assert(assertion) \
do { \
if (!(assertion)) { \
fprintf(stderr, "ASSERT: CFG validation in %s failed!\n", stage_abbrev); \
fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \
abort(); \
} \
} while (0)
#ifndef NDEBUG
void
cfg_t::validate(const char *stage_abbrev)
{
foreach_block(block, this) {
foreach_list_typed(bblock_link, successor, link, &block->children) {
/* Each successor of a block must have one predecessor link back to
* the block.
*/
bool successor_links_back_to_predecessor = false;
bblock_t *succ_block = successor->block;
foreach_list_typed(bblock_link, predecessor, link, &succ_block->parents) {
if (predecessor->block == block) {
cfgv_assert(!successor_links_back_to_predecessor);
cfgv_assert(successor->kind == predecessor->kind);
successor_links_back_to_predecessor = true;
}
}
cfgv_assert(successor_links_back_to_predecessor);
/* Each successor block must appear only once in the list of
* successors.
*/
foreach_list_typed_from(bblock_link, later_successor, link,
&block->children, successor->link.next) {
cfgv_assert(successor->block != later_successor->block);
}
}
foreach_list_typed(bblock_link, predecessor, link, &block->parents) {
/* Each predecessor of a block must have one successor link back to
* the block.
*/
bool predecessor_links_back_to_successor = false;
bblock_t *pred_block = predecessor->block;
foreach_list_typed(bblock_link, successor, link, &pred_block->children) {
if (successor->block == block) {
cfgv_assert(!predecessor_links_back_to_successor);
cfgv_assert(successor->kind == predecessor->kind);
predecessor_links_back_to_successor = true;
}
}
cfgv_assert(predecessor_links_back_to_successor);
/* Each predecessor block must appear only once in the list of
* predecessors.
*/
foreach_list_typed_from(bblock_link, later_predecessor, link,
&block->parents, predecessor->link.next) {
cfgv_assert(predecessor->block != later_predecessor->block);
}
}
backend_instruction *first_inst = block->start();
if (first_inst->opcode == BRW_OPCODE_DO) {
/* DO instructions both begin and end a block, so the DO instruction
* must be the only instruction in the block.
*/
cfgv_assert(exec_list_is_singular(&block->instructions));
/* A block starting with DO should have exactly two successors. One
* is a physical link to the block starting after the WHILE
* instruction. The other is a logical link to the block starting the
* body of the loop.
*/
bblock_t *physical_block = nullptr;
bblock_t *logical_block = nullptr;
foreach_list_typed(bblock_link, child, link, &block->children) {
if (child->kind == bblock_link_physical) {
cfgv_assert(physical_block == nullptr);
physical_block = child->block;
} else {
cfgv_assert(logical_block == nullptr);
logical_block = child->block;
}
}
cfgv_assert(logical_block != nullptr);
cfgv_assert(physical_block != nullptr);
}
}
}
#endif


@@ -0,0 +1,532 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#ifndef BRW_CFG_H
#define BRW_CFG_H
#include "brw_ir.h"
#ifdef __cplusplus
#include "brw_ir_analysis.h"
#endif
struct bblock_t;
/**
* CFG edge types.
*
* A logical edge represents a potential control flow path of the original
* scalar program, while a physical edge represents a control flow path that
* may not have existed in the original program but was introduced during
* vectorization in order to implement divergent control flow of different
* shader invocations within the same SIMD thread.
*
* All logical edges in the CFG are considered to be physical edges but not
* the other way around -- I.e. the logical CFG is a subset of the physical
* one.
*/
enum bblock_link_kind {
bblock_link_logical = 0,
bblock_link_physical
};
struct bblock_link {
#ifdef __cplusplus
DECLARE_RALLOC_CXX_OPERATORS(bblock_link)
bblock_link(bblock_t *block, enum bblock_link_kind kind)
: block(block), kind(kind)
{
}
#endif
struct exec_node link;
struct bblock_t *block;
/* Type of this CFG edge. Because bblock_link_logical also implies
* bblock_link_physical, the proper way to test for membership of edge 'l'
* in CFG kind 'k' is 'l.kind <= k'.
*/
enum bblock_link_kind kind;
};
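/* A minimal helper sketch (hypothetical, not part of the original
* interface) of the membership test described above: a logical edge also
* belongs to the physical CFG, so testing edge 'l' against CFG kind 'k'
* reduces to a comparison.
*/
static inline bool
bblock_link_is_in_cfg_kind(const struct bblock_link *l,
enum bblock_link_kind k)
{
return l->kind <= k;
}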
struct backend_shader;
struct cfg_t;
struct bblock_t {
#ifdef __cplusplus
DECLARE_RALLOC_CXX_OPERATORS(bblock_t)
explicit bblock_t(cfg_t *cfg);
void add_successor(void *mem_ctx, bblock_t *successor,
enum bblock_link_kind kind);
bool is_predecessor_of(const bblock_t *block,
enum bblock_link_kind kind) const;
bool is_successor_of(const bblock_t *block,
enum bblock_link_kind kind) const;
bool can_combine_with(const bblock_t *that) const;
void combine_with(bblock_t *that);
void dump(FILE *file = stderr) const;
backend_instruction *start();
const backend_instruction *start() const;
backend_instruction *end();
const backend_instruction *end() const;
bblock_t *next();
const bblock_t *next() const;
bblock_t *prev();
const bblock_t *prev() const;
bool starts_with_control_flow() const;
bool ends_with_control_flow() const;
backend_instruction *first_non_control_flow_inst();
backend_instruction *last_non_control_flow_inst();
private:
/**
* \sa unlink_parents, unlink_children
*/
void unlink_list(exec_list *);
public:
void unlink_parents()
{
unlink_list(&parents);
}
void unlink_children()
{
unlink_list(&children);
}
#endif
struct exec_node link;
struct cfg_t *cfg;
int start_ip;
int end_ip;
/**
* Change in end_ip since the last time IPs of later blocks were updated.
*/
int end_ip_delta;
struct exec_list instructions;
struct exec_list parents;
struct exec_list children;
int num;
};
static inline struct backend_instruction *
bblock_start(struct bblock_t *block)
{
return (struct backend_instruction *)exec_list_get_head(&block->instructions);
}
static inline const struct backend_instruction *
bblock_start_const(const struct bblock_t *block)
{
return (const struct backend_instruction *)exec_list_get_head_const(&block->instructions);
}
static inline struct backend_instruction *
bblock_end(struct bblock_t *block)
{
return (struct backend_instruction *)exec_list_get_tail(&block->instructions);
}
static inline const struct backend_instruction *
bblock_end_const(const struct bblock_t *block)
{
return (const struct backend_instruction *)exec_list_get_tail_const(&block->instructions);
}
static inline struct bblock_t *
bblock_next(struct bblock_t *block)
{
if (exec_node_is_tail_sentinel(block->link.next))
return NULL;
return (struct bblock_t *)block->link.next;
}
static inline const struct bblock_t *
bblock_next_const(const struct bblock_t *block)
{
if (exec_node_is_tail_sentinel(block->link.next))
return NULL;
return (const struct bblock_t *)block->link.next;
}
static inline struct bblock_t *
bblock_prev(struct bblock_t *block)
{
if (exec_node_is_head_sentinel(block->link.prev))
return NULL;
return (struct bblock_t *)block->link.prev;
}
static inline const struct bblock_t *
bblock_prev_const(const struct bblock_t *block)
{
if (exec_node_is_head_sentinel(block->link.prev))
return NULL;
return (const struct bblock_t *)block->link.prev;
}
static inline bool
bblock_starts_with_control_flow(const struct bblock_t *block)
{
enum opcode op = bblock_start_const(block)->opcode;
return op == BRW_OPCODE_DO || op == BRW_OPCODE_ENDIF;
}
static inline bool
bblock_ends_with_control_flow(const struct bblock_t *block)
{
enum opcode op = bblock_end_const(block)->opcode;
return op == BRW_OPCODE_IF ||
op == BRW_OPCODE_ELSE ||
op == BRW_OPCODE_WHILE ||
op == BRW_OPCODE_BREAK ||
op == BRW_OPCODE_CONTINUE;
}
static inline struct backend_instruction *
bblock_first_non_control_flow_inst(struct bblock_t *block)
{
struct backend_instruction *inst = bblock_start(block);
if (bblock_starts_with_control_flow(block))
#ifdef __cplusplus
inst = (struct backend_instruction *)inst->next;
#else
inst = (struct backend_instruction *)inst->link.next;
#endif
return inst;
}
static inline struct backend_instruction *
bblock_last_non_control_flow_inst(struct bblock_t *block)
{
struct backend_instruction *inst = bblock_end(block);
if (bblock_ends_with_control_flow(block))
#ifdef __cplusplus
inst = (struct backend_instruction *)inst->prev;
#else
inst = (struct backend_instruction *)inst->link.prev;
#endif
return inst;
}
#ifdef __cplusplus
inline backend_instruction *
bblock_t::start()
{
return bblock_start(this);
}
inline const backend_instruction *
bblock_t::start() const
{
return bblock_start_const(this);
}
inline backend_instruction *
bblock_t::end()
{
return bblock_end(this);
}
inline const backend_instruction *
bblock_t::end() const
{
return bblock_end_const(this);
}
inline bblock_t *
bblock_t::next()
{
return bblock_next(this);
}
inline const bblock_t *
bblock_t::next() const
{
return bblock_next_const(this);
}
inline bblock_t *
bblock_t::prev()
{
return bblock_prev(this);
}
inline const bblock_t *
bblock_t::prev() const
{
return bblock_prev_const(this);
}
inline bool
bblock_t::starts_with_control_flow() const
{
return bblock_starts_with_control_flow(this);
}
inline bool
bblock_t::ends_with_control_flow() const
{
return bblock_ends_with_control_flow(this);
}
inline backend_instruction *
bblock_t::first_non_control_flow_inst()
{
return bblock_first_non_control_flow_inst(this);
}
inline backend_instruction *
bblock_t::last_non_control_flow_inst()
{
return bblock_last_non_control_flow_inst(this);
}
#endif
struct cfg_t {
#ifdef __cplusplus
DECLARE_RALLOC_CXX_OPERATORS(cfg_t)
cfg_t(const backend_shader *s, exec_list *instructions);
~cfg_t();
void remove_block(bblock_t *block);
bblock_t *first_block();
const bblock_t *first_block() const;
bblock_t *last_block();
const bblock_t *last_block() const;
bblock_t *new_block();
void set_next_block(bblock_t **cur, bblock_t *block, int ip);
void make_block_array();
void dump(FILE *file = stderr);
void dump_cfg();
#ifdef NDEBUG
void validate(UNUSED const char *stage_abbrev) { }
#else
void validate(const char *stage_abbrev);
#endif
/**
* Propagate bblock_t::end_ip_delta data through the CFG.
*/
inline void adjust_block_ips();
#endif
const struct backend_shader *s;
void *mem_ctx;
/** Ordered list (by ip) of basic blocks */
struct exec_list block_list;
struct bblock_t **blocks;
int num_blocks;
};
static inline struct bblock_t *
cfg_first_block(struct cfg_t *cfg)
{
return (struct bblock_t *)exec_list_get_head(&cfg->block_list);
}
static inline const struct bblock_t *
cfg_first_block_const(const struct cfg_t *cfg)
{
return (const struct bblock_t *)exec_list_get_head_const(&cfg->block_list);
}
static inline struct bblock_t *
cfg_last_block(struct cfg_t *cfg)
{
return (struct bblock_t *)exec_list_get_tail(&cfg->block_list);
}
static inline const struct bblock_t *
cfg_last_block_const(const struct cfg_t *cfg)
{
return (const struct bblock_t *)exec_list_get_tail_const(&cfg->block_list);
}
#ifdef __cplusplus
inline bblock_t *
cfg_t::first_block()
{
return cfg_first_block(this);
}
inline const bblock_t *
cfg_t::first_block() const
{
return cfg_first_block_const(this);
}
inline bblock_t *
cfg_t::last_block()
{
return cfg_last_block(this);
}
inline const bblock_t *
cfg_t::last_block() const
{
return cfg_last_block_const(this);
}
#endif
/* Note that this is implemented with a double for loop -- break will
* break from the inner loop only!
*/
#define foreach_block_and_inst(__block, __type, __inst, __cfg) \
foreach_block (__block, __cfg) \
foreach_inst_in_block (__type, __inst, __block)
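/* Illustrative caveat (sketch; should_stop() is a hypothetical predicate):
* because the macro expands to two nested loops, 'break' only leaves the
* inner per-instruction loop and the walk resumes with the next block:
*
*    foreach_block_and_inst(block, backend_instruction, inst, cfg) {
*       if (should_stop(inst))
*          break;   // skips to the next block, does NOT end the walk
*    }
*/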
/* Note that this is implemented with a double for loop -- break will
* break from the inner loop only!
*/
#define foreach_block_and_inst_safe(__block, __type, __inst, __cfg) \
foreach_block_safe (__block, __cfg) \
foreach_inst_in_block_safe (__type, __inst, __block)
#define foreach_block(__block, __cfg) \
foreach_list_typed (bblock_t, __block, link, &(__cfg)->block_list)
#define foreach_block_reverse(__block, __cfg) \
foreach_list_typed_reverse (bblock_t, __block, link, &(__cfg)->block_list)
#define foreach_block_safe(__block, __cfg) \
foreach_list_typed_safe (bblock_t, __block, link, &(__cfg)->block_list)
#define foreach_block_reverse_safe(__block, __cfg) \
foreach_list_typed_reverse_safe (bblock_t, __block, link, &(__cfg)->block_list)
#define foreach_inst_in_block(__type, __inst, __block) \
foreach_in_list(__type, __inst, &(__block)->instructions)
#define foreach_inst_in_block_safe(__type, __inst, __block) \
for (__type *__inst = (__type *)__block->instructions.head_sentinel.next, \
*__next = (__type *)__inst->next; \
__next != NULL; \
__inst = __next, \
__next = (__type *)__next->next)
#define foreach_inst_in_block_reverse(__type, __inst, __block) \
foreach_in_list_reverse(__type, __inst, &(__block)->instructions)
#define foreach_inst_in_block_reverse_safe(__type, __inst, __block) \
foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions)
#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \
for (__type *__scan_inst = (__type *)__inst->next; \
!__scan_inst->is_tail_sentinel(); \
__scan_inst = (__type *)__scan_inst->next)
#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \
for (__type *__scan_inst = (__type *)__inst->prev; \
!__scan_inst->is_head_sentinel(); \
__scan_inst = (__type *)__scan_inst->prev)
#ifdef __cplusplus
inline void
cfg_t::adjust_block_ips()
{
int delta = 0;
foreach_block(block, this) {
block->start_ip += delta;
block->end_ip += delta;
delta += block->end_ip_delta;
block->end_ip_delta = 0;
}
}
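/* Worked example (assuming a pass inserted one instruction into block B,
* updated B->end_ip itself and recorded B->end_ip_delta = 1): the loop
* above shifts the start_ip/end_ip of every block after B by one and
* clears the delta, restoring consistent IPs without rebuilding the CFG.
*/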
namespace brw {
/**
* Immediate dominator tree analysis of a shader.
*/
struct idom_tree {
idom_tree(const backend_shader *s);
~idom_tree();
bool
validate(const backend_shader *) const
{
/* FINISHME */
return true;
}
analysis_dependency_class
dependency_class() const
{
return DEPENDENCY_BLOCKS;
}
const bblock_t *
parent(const bblock_t *b) const
{
assert(unsigned(b->num) < num_parents);
return parents[b->num];
}
bblock_t *
parent(bblock_t *b) const
{
assert(unsigned(b->num) < num_parents);
return parents[b->num];
}
bblock_t *
intersect(bblock_t *b1, bblock_t *b2) const;
void
dump() const;
private:
unsigned num_parents;
bblock_t **parents;
};
}
#endif
#endif /* BRW_CFG_H */


@@ -0,0 +1,163 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#ifndef BRW_CLIP_H
#define BRW_CLIP_H
#include "brw_compiler.h"
#include "brw_eu.h"
/* Initial 3 verts, plus at most 6 additional verts from intersections
* with fixed planes, plus at most 8 additional verts from intersections
* with user clip planes
*/
#define MAX_VERTS (3+6+8)
#define PRIM_MASK (0x1f)
struct brw_clip_compile {
struct brw_codegen func;
struct brw_clip_prog_key key;
struct brw_clip_prog_data prog_data;
struct {
struct brw_reg R0;
struct brw_reg vertex[MAX_VERTS];
struct brw_reg t;
struct brw_reg t0, t1;
struct brw_reg dp0, dp1;
struct brw_reg dpPrev;
struct brw_reg dp;
struct brw_reg loopcount;
struct brw_reg nr_verts;
struct brw_reg planemask;
struct brw_reg inlist;
struct brw_reg outlist;
struct brw_reg freelist;
struct brw_reg dir;
struct brw_reg tmp0, tmp1;
struct brw_reg offset;
struct brw_reg fixed_planes;
struct brw_reg plane_equation;
struct brw_reg ff_sync;
/* Bitmask indicating which coordinate attribute should be used for
* comparison to each clipping plane. A 0 indicates that VARYING_SLOT_POS
* should be used, because it's one of the fixed +/- x/y/z planes that
* constitute the bounds of the view volume. A 1 indicates that
* VARYING_SLOT_CLIP_VERTEX should be used (if available) since it's a user-
* defined clipping plane.
*/
struct brw_reg vertex_src_mask;
/* Offset into the vertex of the current plane's clipdistance value */
struct brw_reg clipdistance_offset;
} reg;
/* Number of registers storing VUE data */
GLuint nr_regs;
GLuint first_tmp;
GLuint last_tmp;
bool need_direction;
struct intel_vue_map vue_map;
};
/**
* True if the given varying is one of the outputs of the vertex shader.
*/
static inline bool brw_clip_have_varying(struct brw_clip_compile *c,
GLuint varying)
{
return (c->key.attrs & BITFIELD64_BIT(varying)) != 0;
}
/* Points are only culled, so no clip routine is strictly needed; however,
* it works out easier to have a dummy one.
*/
void brw_emit_unfilled_clip( struct brw_clip_compile *c );
void brw_emit_tri_clip( struct brw_clip_compile *c );
void brw_emit_line_clip( struct brw_clip_compile *c );
void brw_emit_point_clip( struct brw_clip_compile *c );
/* brw_clip_tri.c, for use by the unfilled clip routine:
*/
void brw_clip_tri_init_vertices( struct brw_clip_compile *c );
void brw_clip_tri_flat_shade( struct brw_clip_compile *c );
void brw_clip_tri( struct brw_clip_compile *c );
void brw_clip_tri_emit_polygon( struct brw_clip_compile *c );
void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
GLuint nr_verts );
/* Utils:
*/
void brw_clip_interp_vertex( struct brw_clip_compile *c,
struct brw_indirect dest_ptr,
struct brw_indirect v0_ptr, /* from */
struct brw_indirect v1_ptr, /* to */
struct brw_reg t0,
bool force_edgeflag );
void brw_clip_init_planes( struct brw_clip_compile *c );
void brw_clip_emit_vue(struct brw_clip_compile *c,
struct brw_indirect vert,
enum brw_urb_write_flags flags,
GLuint header);
void brw_clip_kill_thread(struct brw_clip_compile *c);
struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c );
struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c );
void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c,
GLuint to, GLuint from );
void brw_clip_init_clipmask( struct brw_clip_compile *c );
struct brw_reg get_tmp( struct brw_clip_compile *c );
void brw_clip_project_position(struct brw_clip_compile *c,
struct brw_reg pos );
void brw_clip_ff_sync(struct brw_clip_compile *c);
void brw_clip_init_ff_sync(struct brw_clip_compile *c);
#endif


@@ -0,0 +1,303 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
#include "brw_prim.h"
static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
{
const struct intel_device_info *devinfo = c->func.devinfo;
GLuint i = 0, j;
/* Register usage is static, precompute here:
*/
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
if (c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec4_grf(i, 0);
i += (6 + c->key.nr_userclip + 1) / 2;
c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
}
else
c->prog_data.curb_read_length = 0;
/* Payload vertices plus space for more generated vertices:
*/
for (j = 0; j < 4; j++) {
c->reg.vertex[j] = brw_vec4_grf(i, 0);
i += c->nr_regs;
}
c->reg.t = brw_vec1_grf(i, 0);
c->reg.t0 = brw_vec1_grf(i, 1);
c->reg.t1 = brw_vec1_grf(i, 2);
c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
c->reg.plane_equation = brw_vec4_grf(i, 4);
i++;
c->reg.dp0 = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
c->reg.dp1 = brw_vec1_grf(i, 4);
i++;
if (!c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec8_grf(i, 0);
i++;
}
c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W);
i++;
if (devinfo->ver == 5) {
c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
i++;
}
c->first_tmp = i;
c->last_tmp = i;
c->prog_data.urb_read_length = c->nr_regs; /* ? */
c->prog_data.total_grf = i;
}
/* Line clipping, more or less following the following algorithm:
*
* for (p=0;p<MAX_PLANES;p++) {
* if (clipmask & (1 << p)) {
* GLfloat dp0 = DOTPROD( vtx0, plane[p] );
* GLfloat dp1 = DOTPROD( vtx1, plane[p] );
*
* if (dp1 < 0.0f) {
* GLfloat t = dp1 / (dp1 - dp0);
* if (t > t1) t1 = t;
* } else {
* GLfloat t = dp0 / (dp0 - dp1);
* if (t > t0) t0 = t;
* }
*
* if (t0 + t1 >= 1.0)
* return;
* }
* }
*
* interp( ctx, newvtx0, vtx0, vtx1, t0 );
* interp( ctx, newvtx1, vtx1, vtx0, t1 );
*
*/
static void clip_and_emit_line( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_indirect vtx0 = brw_indirect(0, 0);
struct brw_indirect vtx1 = brw_indirect(1, 0);
struct brw_indirect newvtx0 = brw_indirect(2, 0);
struct brw_indirect newvtx1 = brw_indirect(3, 0);
struct brw_indirect plane_ptr = brw_indirect(4, 0);
struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
GLint clipdist0_offset = c->key.nr_userclip
? brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0)
: 0;
brw_MOV(p, get_addr_reg(vtx0), brw_address(c->reg.vertex[0]));
brw_MOV(p, get_addr_reg(vtx1), brw_address(c->reg.vertex[1]));
brw_MOV(p, get_addr_reg(newvtx0), brw_address(c->reg.vertex[2]));
brw_MOV(p, get_addr_reg(newvtx1), brw_address(c->reg.vertex[3]));
brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
/* Note: init t0, t1 together:
*/
brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0));
brw_clip_init_planes(c);
brw_clip_init_clipmask(c);
/* -ve rhw workaround */
if (p->devinfo->has_negative_rhw_bug) {
brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
brw_imm_ud(1<<20));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
/* Set the initial vertex source mask: The first 6 planes are the bounds
* of the view volume; the next 8 planes are the user clipping planes.
*/
brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0));
/* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0].
* We'll increment 6 times before we start hitting actual user clipping. */
brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float)));
brw_DO(p, BRW_EXECUTE_1);
{
/* if (planemask & 1)
*/
brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
brw_AND(p, v1_null_ud, c->reg.vertex_src_mask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
/* user clip distance: just fetch the correct float from each vertex */
struct brw_indirect temp_ptr = brw_indirect(7, 0);
brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx0), c->reg.clipdistance_offset);
brw_MOV(p, c->reg.dp0, deref_1f(temp_ptr, 0));
brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx1), c->reg.clipdistance_offset);
brw_MOV(p, c->reg.dp1, deref_1f(temp_ptr, 0));
}
brw_ELSE(p);
{
/* fixed plane: fetch the hpos, dp4 against the plane. */
if (c->key.nr_userclip)
brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
else
brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, hpos_offset), c->reg.plane_equation);
brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, hpos_offset), c->reg.plane_equation);
}
brw_ENDIF(p);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, vec1(c->reg.dp1), brw_imm_f(0.0f));
brw_IF(p, BRW_EXECUTE_1);
{
/*
* Both can be negative on GM965/G965 due to RHW workaround
* if so, this object should be rejected.
*/
if (p->devinfo->has_negative_rhw_bug) {
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
}
brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
brw_MOV(p, c->reg.t1, c->reg.t);
brw_inst_set_pred_control(p->devinfo, brw_last_inst,
BRW_PREDICATE_NORMAL);
}
brw_ELSE(p);
{
/* Coming back in. We know that both cannot be negative
* because the line would have been culled in that case.
*/
/* If both are positive, do nothing */
/* Only on GM965/G965 */
if (p->devinfo->has_negative_rhw_bug) {
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
brw_IF(p, BRW_EXECUTE_1);
}
{
brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
brw_MOV(p, c->reg.t0, c->reg.t);
brw_inst_set_pred_control(p->devinfo, brw_last_inst,
BRW_PREDICATE_NORMAL);
}
if (p->devinfo->has_negative_rhw_bug) {
brw_ENDIF(p);
}
}
brw_ENDIF(p);
}
brw_ENDIF(p);
/* plane_ptr++;
*/
brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
   /* while ((planemask >>= 1) != 0)
    */
brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, false);
brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, false);
brw_clip_emit_vue(c, newvtx0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START);
brw_clip_emit_vue(c, newvtx1, BRW_URB_WRITE_EOT_COMPLETE,
(_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END);
}
brw_ENDIF(p);
brw_clip_kill_thread(c);
}
void brw_emit_line_clip( struct brw_clip_compile *c )
{
brw_clip_line_alloc_regs(c);
brw_clip_init_ff_sync(c);
if (c->key.contains_flat_varying) {
if (c->key.pv_first)
brw_clip_copy_flatshaded_attributes(c, 1, 0);
else
brw_clip_copy_flatshaded_attributes(c, 0, 1);
}
clip_and_emit_line(c);
}


@ -0,0 +1,45 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
/* Point clipping, nothing to do?
*/
void brw_emit_point_clip( struct brw_clip_compile *c )
{
/* Send an empty message to kill the thread:
*/
brw_clip_tri_alloc_regs(c, 0);
brw_clip_init_ff_sync(c);
brw_clip_kill_thread(c);
}


@ -0,0 +1,659 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
#include "brw_prim.h"
static void release_tmps( struct brw_clip_compile *c )
{
c->last_tmp = c->first_tmp;
}
void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
GLuint nr_verts )
{
const struct intel_device_info *devinfo = c->func.devinfo;
GLuint i = 0,j;
/* Register usage is static, precompute here:
*/
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
if (c->key.nr_userclip) {
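      /* Each GRF holds two vec4 plane equations, so pushing the six
       * view-volume planes plus nr_userclip user planes through the
       * CURBE takes ceil((6 + nr_userclip) / 2) registers.
       */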
c->reg.fixed_planes = brw_vec4_grf(i, 0);
i += (6 + c->key.nr_userclip + 1) / 2;
c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
}
else
c->prog_data.curb_read_length = 0;
/* Payload vertices plus space for more generated vertices:
*/
for (j = 0; j < nr_verts; j++) {
c->reg.vertex[j] = brw_vec4_grf(i, 0);
i += c->nr_regs;
}
if (c->vue_map.num_slots % 2 && nr_verts > 0) {
/* The VUE has an odd number of slots so the last register is only half
* used. Fill the second half with zero.
*/
for (j = 0; j < 3; j++) {
GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots);
brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
}
}
c->reg.t = brw_vec1_grf(i, 0);
c->reg.loopcount = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_D);
c->reg.nr_verts = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD);
c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
c->reg.plane_equation = brw_vec4_grf(i, 4);
i++;
c->reg.dpPrev = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
c->reg.dp = brw_vec1_grf(i, 4);
i++;
c->reg.inlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
i++;
c->reg.outlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
i++;
c->reg.freelist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
i++;
if (!c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec8_grf(i, 0);
i++;
}
if (c->key.do_unfilled) {
c->reg.dir = brw_vec4_grf(i, 0);
c->reg.offset = brw_vec4_grf(i, 4);
i++;
c->reg.tmp0 = brw_vec4_grf(i, 0);
c->reg.tmp1 = brw_vec4_grf(i, 4);
i++;
}
c->reg.vertex_src_mask = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
c->reg.clipdistance_offset = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_W);
i++;
if (devinfo->ver == 5) {
c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
i++;
}
c->first_tmp = i;
c->last_tmp = i;
c->prog_data.urb_read_length = c->nr_regs; /* ? */
c->prog_data.total_grf = i;
}
void brw_clip_tri_init_vertices( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
/* Initial list of indices for incoming vertices:
*/
brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
   /* XXX: Is there an easier way to do this?  We need to reverse every
    * second tristrip element; can that be ignored sometimes?
    */
brw_IF(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]) );
brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]) );
if (c->need_direction)
brw_MOV(p, c->reg.dir, brw_imm_f(-1));
}
brw_ELSE(p);
{
brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]) );
brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]) );
if (c->need_direction)
brw_MOV(p, c->reg.dir, brw_imm_f(1));
}
brw_ENDIF(p);
brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]) );
brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0));
brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3));
}
void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_POLYGON));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_copy_flatshaded_attributes(c, 1, 0);
brw_clip_copy_flatshaded_attributes(c, 2, 0);
}
brw_ELSE(p);
{
if (c->key.pv_first) {
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_TRIFAN));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_copy_flatshaded_attributes(c, 0, 1);
brw_clip_copy_flatshaded_attributes(c, 2, 1);
}
brw_ELSE(p);
{
brw_clip_copy_flatshaded_attributes(c, 1, 0);
brw_clip_copy_flatshaded_attributes(c, 2, 0);
}
brw_ENDIF(p);
}
else {
brw_clip_copy_flatshaded_attributes(c, 0, 2);
brw_clip_copy_flatshaded_attributes(c, 1, 2);
}
}
brw_ENDIF(p);
}
/**
* Loads the clip distance for a vertex into `dst`, and ends with
* a comparison of it to zero with the condition `cond`.
*
* - If using a fixed plane, the distance is dot(hpos, plane).
* - If using a user clip plane, the distance is directly available in the vertex.
*/
static inline void
load_clip_distance(struct brw_clip_compile *c, struct brw_indirect vtx,
struct brw_reg dst, GLuint hpos_offset, int cond)
{
struct brw_codegen *p = &c->func;
dst = vec4(dst);
brw_AND(p, vec1(brw_null_reg()), c->reg.vertex_src_mask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
struct brw_indirect temp_ptr = brw_indirect(7, 0);
brw_ADD(p, get_addr_reg(temp_ptr), get_addr_reg(vtx), c->reg.clipdistance_offset);
brw_MOV(p, vec1(dst), deref_1f(temp_ptr, 0));
}
brw_ELSE(p);
{
brw_MOV(p, dst, deref_4f(vtx, hpos_offset));
brw_DP4(p, dst, dst, c->reg.plane_equation);
}
brw_ENDIF(p);
brw_CMP(p, brw_null_reg(), cond, vec1(dst), brw_imm_f(0.0f));
}
/* Use mesa's clipping algorithms, translated to GFX4 assembly.
*/
void brw_clip_tri( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_indirect vtx = brw_indirect(0, 0);
struct brw_indirect vtxPrev = brw_indirect(1, 0);
struct brw_indirect vtxOut = brw_indirect(2, 0);
struct brw_indirect plane_ptr = brw_indirect(3, 0);
struct brw_indirect inlist_ptr = brw_indirect(4, 0);
struct brw_indirect outlist_ptr = brw_indirect(5, 0);
struct brw_indirect freelist_ptr = brw_indirect(6, 0);
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
GLint clipdist0_offset = c->key.nr_userclip
? brw_varying_to_offset(&c->vue_map, VARYING_SLOT_CLIP_DIST0)
: 0;
brw_MOV(p, get_addr_reg(vtxPrev), brw_address(c->reg.vertex[2]) );
brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) );
/* Set the initial vertex source mask: The first 6 planes are the bounds
* of the view volume; the next 8 planes are the user clipping planes.
*/
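   /* 0x3fc0 sets bits 6..13, so the first six iterations (fixed planes)
    * see a zero bit and the next eight (user planes) a one.
    */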
brw_MOV(p, c->reg.vertex_src_mask, brw_imm_ud(0x3fc0));
/* Set the initial clipdistance offset to be 6 floats before gl_ClipDistance[0].
* We'll increment 6 times before we start hitting actual user clipping. */
brw_MOV(p, c->reg.clipdistance_offset, brw_imm_d(clipdist0_offset - 6*sizeof(float)));
brw_DO(p, BRW_EXECUTE_1);
{
/* if (planemask & 1)
*/
brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
/* vtxOut = freelist_ptr++
*/
brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(freelist_ptr) );
brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE));
if (c->key.nr_userclip)
brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
else
brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0));
brw_DO(p, BRW_EXECUTE_1);
{
/* vtx = *input_ptr;
*/
brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0));
load_clip_distance(c, vtxPrev, c->reg.dpPrev, hpos_offset, BRW_CONDITIONAL_L);
/* (prev < 0.0f) */
brw_IF(p, BRW_EXECUTE_1);
{
load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_GE);
/* IS_POSITIVE(next)
*/
brw_IF(p, BRW_EXECUTE_1);
{
/* Coming back in.
*/
brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev);
/* If (vtxOut == 0) vtxOut = vtxPrev
*/
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev));
brw_inst_set_pred_control(p->devinfo, brw_last_inst,
BRW_PREDICATE_NORMAL);
brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, false);
/* *outlist_ptr++ = vtxOut;
* nr_verts++;
* vtxOut = 0;
*/
brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
}
brw_ENDIF(p);
}
brw_ELSE(p);
{
/* *outlist_ptr++ = vtxPrev;
* nr_verts++;
*/
brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev));
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
load_clip_distance(c, vtx, c->reg.dp, hpos_offset, BRW_CONDITIONAL_L);
/* (next < 0.0f)
*/
brw_IF(p, BRW_EXECUTE_1);
{
/* Going out of bounds. Avoid division by zero as we
* know dp != dpPrev from DIFFERENT_SIGNS, above.
*/
brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp);
/* If (vtxOut == 0) vtxOut = vtx
*/
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx));
brw_inst_set_pred_control(p->devinfo, brw_last_inst,
BRW_PREDICATE_NORMAL);
brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, true);
/* *outlist_ptr++ = vtxOut;
* nr_verts++;
* vtxOut = 0;
*/
brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
}
brw_ENDIF(p);
}
brw_ENDIF(p);
/* vtxPrev = vtx;
* inlist_ptr++;
*/
brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx));
brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short)));
/* while (--loopcount != 0)
*/
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
/* vtxPrev = *(outlist_ptr-1) OR: outlist[nr_verts-1]
* inlist = outlist
* inlist_ptr = &inlist[0]
* outlist_ptr = &outlist[0]
*/
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2));
brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0));
brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0));
brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
}
brw_ENDIF(p);
/* plane_ptr++;
*/
brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
/* nr_verts >= 3
*/
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_GE,
c->reg.nr_verts,
brw_imm_ud(3));
brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
      /* && ((planemask >>= 1) != 0)
       */
brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_SHR(p, c->reg.vertex_src_mask, c->reg.vertex_src_mask, brw_imm_ud(1));
brw_ADD(p, c->reg.clipdistance_offset, c->reg.clipdistance_offset, brw_imm_w(sizeof(float)));
}
brw_WHILE(p);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
void brw_clip_tri_emit_polygon(struct brw_clip_compile *c)
{
struct brw_codegen *p = &c->func;
/* for (loopcount = nr_verts-2; loopcount > 0; loopcount--)
*/
brw_ADD(p,
c->reg.loopcount,
c->reg.nr_verts,
brw_imm_d(-2));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G);
brw_IF(p, BRW_EXECUTE_1);
{
struct brw_indirect v0 = brw_indirect(0, 0);
struct brw_indirect vptr = brw_indirect(1, 0);
brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist));
brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START));
brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
brw_DO(p, BRW_EXECUTE_1);
{
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT));
brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_EOT_COMPLETE,
((_3DPRIM_TRIFAN << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
}
brw_ENDIF(p);
}
static void do_clip_tri( struct brw_clip_compile *c )
{
brw_clip_init_planes(c);
brw_clip_tri(c);
}
static void maybe_do_clip_tri( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
brw_IF(p, BRW_EXECUTE_1);
{
do_clip_tri(c);
}
brw_ENDIF(p);
}
static void brw_clip_test( struct brw_clip_compile *c )
{
struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg v0 = get_tmp(c);
struct brw_reg v1 = get_tmp(c);
struct brw_reg v2 = get_tmp(c);
struct brw_indirect vt0 = brw_indirect(0, 0);
struct brw_indirect vt1 = brw_indirect(1, 0);
struct brw_indirect vt2 = brw_indirect(2, 0);
struct brw_codegen *p = &c->func;
struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_POS);
brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
brw_MOV(p, v0, deref_4f(vt0, hpos_offset));
brw_MOV(p, v1, deref_4f(vt1, hpos_offset));
brw_MOV(p, v2, deref_4f(vt2, hpos_offset));
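   /* Clear the six view-volume bits of the planemask (5..0: xmin, xmax,
    * ymin, ymax, nearz, farz, matching the order they are set below);
    * they are recomputed from the per-vertex outcode tests that follow.
    */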
brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f));
/* test nearz, xmin, ymin plane */
/* clip.xyz < -clip.w */
brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3)));
brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3)));
brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3)));
/* All vertices are outside of a plane, rejected */
brw_AND(p, t, t1, t2);
brw_AND(p, t, t, t3);
brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
brw_OR(p, tmp0, tmp0, get_element(t, 2));
brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   /* Some vertices are inside a plane, some are outside: need to clip. */
brw_XOR(p, t, t1, t2);
brw_XOR(p, t1, t2, t3);
brw_OR(p, t, t, t1);
brw_AND(p, t, t, brw_imm_ud(0x1));
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 0), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 1), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 2), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
/* test farz, xmax, ymax plane */
/* clip.xyz > clip.w */
brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3));
brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3));
brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3));
/* All vertices are outside of a plane, rejected */
brw_AND(p, t, t1, t2);
brw_AND(p, t, t, t3);
brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
brw_OR(p, tmp0, tmp0, get_element(t, 2));
brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   /* Some vertices are inside a plane, some are outside: need to clip. */
brw_XOR(p, t, t1, t2);
brw_XOR(p, t1, t2, t3);
brw_OR(p, t, t, t1);
brw_AND(p, t, t, brw_imm_ud(0x1));
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 0), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 1), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 2), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
release_tmps(c);
}
void brw_emit_tri_clip( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
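   /* Three payload vertices, plus room for one extra vertex per clip
    * plane (6 fixed + nr_userclip user planes): each plane can add at
    * most one vertex to the polygon.
    */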
brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
brw_clip_tri_init_vertices(c);
brw_clip_init_clipmask(c);
brw_clip_init_ff_sync(c);
   /* If the negative-RHW workaround bit is set, do the clip test:
    */
if (p->devinfo->has_negative_rhw_bug) {
brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
brw_imm_ud(1<<20));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_test(c);
}
brw_ENDIF(p);
}
   /* Can't push this into do_clip_tri: with polygon (or quad)
    * flatshading, the flatshade must be applied here because we don't
    * respect the PV when converting to trifan for emit:
    */
if (c->key.contains_flat_varying)
brw_clip_tri_flat_shade(c);
if ((c->key.clip_mode == BRW_CLIP_MODE_NORMAL) ||
(c->key.clip_mode == BRW_CLIP_MODE_KERNEL_CLIP))
do_clip_tri(c);
else
maybe_do_clip_tri(c);
brw_clip_tri_emit_polygon(c);
/* Send an empty message to kill the thread:
*/
brw_clip_kill_thread(c);
}


@ -0,0 +1,528 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
#include "brw_prim.h"
/* This is performed against the original triangles, so no indirection
 * required... BZZZT!  (Not quite true: see the "inlist indirection"
 * note in brw_emit_unfilled_clip.)
 */
static void compute_tri_direction( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg e = c->reg.tmp0;
struct brw_reg f = c->reg.tmp1;
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
struct brw_reg v0 = byte_offset(c->reg.vertex[0], hpos_offset);
struct brw_reg v1 = byte_offset(c->reg.vertex[1], hpos_offset);
struct brw_reg v2 = byte_offset(c->reg.vertex[2], hpos_offset);
struct brw_reg v0n = get_tmp(c);
struct brw_reg v1n = get_tmp(c);
struct brw_reg v2n = get_tmp(c);
/* Convert to NDC.
* NOTE: We can't modify the original vertex coordinates,
* as it may impact further operations.
* So, we have to keep normalized coordinates in temp registers.
*
* TBD-KC
* Try to optimize unnecessary MOV's.
*/
brw_MOV(p, v0n, v0);
brw_MOV(p, v1n, v1);
brw_MOV(p, v2n, v2);
brw_clip_project_position(c, v0n);
brw_clip_project_position(c, v1n);
brw_clip_project_position(c, v2n);
/* Calculate the vectors of two edges of the triangle:
*/
brw_ADD(p, e, v0n, negate(v2n));
brw_ADD(p, f, v1n, negate(v2n));
/* Take their crossproduct:
*/
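   /* cross(e, f) = e.yzx*f.zxy - e.zxy*f.yzx: the MUL primes the implicit
    * accumulator with the first product, and MAC accumulates the negated
    * second product.
    */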
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, BRW_SWIZZLE_YZXW),
brw_swizzle(f, BRW_SWIZZLE_ZXYW));
brw_MAC(p, vec4(e), negate(brw_swizzle(e, BRW_SWIZZLE_ZXYW)),
brw_swizzle(f, BRW_SWIZZLE_YZXW));
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e));
}
static void cull_direction( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint conditional;
assert (!(c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL &&
c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL));
if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL)
conditional = BRW_CONDITIONAL_GE;
else
conditional = BRW_CONDITIONAL_L;
brw_CMP(p,
vec1(brw_null_reg()),
conditional,
get_element(c->reg.dir, 2),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
}
static void copy_bfc( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint conditional;
/* Do we have any colors to copy?
*/
if (!(brw_clip_have_varying(c, VARYING_SLOT_COL0) &&
brw_clip_have_varying(c, VARYING_SLOT_BFC0)) &&
!(brw_clip_have_varying(c, VARYING_SLOT_COL1) &&
brw_clip_have_varying(c, VARYING_SLOT_BFC1)))
return;
/* In some weird degenerate cases we can end up testing the
* direction twice, once for culling and once for bfc copying. Oh
* well, that's what you get for setting weird GL state.
*/
if (c->key.copy_bfc_ccw)
conditional = BRW_CONDITIONAL_GE;
else
conditional = BRW_CONDITIONAL_L;
brw_CMP(p,
vec1(brw_null_reg()),
conditional,
get_element(c->reg.dir, 2),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
GLuint i;
for (i = 0; i < 3; i++) {
if (brw_clip_have_varying(c, VARYING_SLOT_COL0) &&
brw_clip_have_varying(c, VARYING_SLOT_BFC0))
brw_MOV(p,
byte_offset(c->reg.vertex[i],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_COL0)),
byte_offset(c->reg.vertex[i],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_BFC0)));
if (brw_clip_have_varying(c, VARYING_SLOT_COL1) &&
brw_clip_have_varying(c, VARYING_SLOT_BFC1))
brw_MOV(p,
byte_offset(c->reg.vertex[i],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_COL1)),
byte_offset(c->reg.vertex[i],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_BFC1)));
}
}
brw_ENDIF(p);
}
/*
GLfloat iz = 1.0 / dir.z;
GLfloat ac = dir.x * iz;
GLfloat bc = dir.y * iz;
offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE;
offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor;
if (ctx->Polygon.OffsetClamp && isfinite(ctx->Polygon.OffsetClamp)) {
if (ctx->Polygon.OffsetClamp < 0)
offset = MAX2( offset, ctx->Polygon.OffsetClamp );
else
offset = MIN2( offset, ctx->Polygon.OffsetClamp );
}
offset *= MRD;
*/
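/* The MAX2/MIN2 and clamp selections below are realized as a CMP that
 * sets the flag register followed by a predicated SEL; 1/dir.z again
 * comes from the math unit via brw_math_invert.
 */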
static void compute_offset( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg off = c->reg.offset;
struct brw_reg dir = c->reg.dir;
brw_math_invert(p, get_element(off, 2), get_element(dir, 2));
brw_MUL(p, vec2(off), vec2(dir), get_element(off, 2));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_GE,
brw_abs(get_element(off, 0)),
brw_abs(get_element(off, 1)));
brw_SEL(p, vec1(off),
brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1)));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_MUL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_factor));
brw_ADD(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_units));
if (c->key.offset_clamp && isfinite(c->key.offset_clamp)) {
brw_CMP(p,
vec1(brw_null_reg()),
c->key.offset_clamp < 0 ? BRW_CONDITIONAL_GE : BRW_CONDITIONAL_L,
vec1(off),
brw_imm_f(c->key.offset_clamp));
brw_SEL(p, vec1(off), vec1(off), brw_imm_f(c->key.offset_clamp));
}
}
static void merge_edgeflags( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0);
brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_POLYGON));
/* Get away with using reg.vertex because we know that this is not
* a _3DPRIM_TRISTRIP_REVERSE:
*/
brw_IF(p, BRW_EXECUTE_1);
{
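      /* R0.2 bits 8 and 9 flag whether this triangle carries the first
       * and last edge of the decomposed polygon; when a bit is clear the
       * corresponding edgeflag is zeroed, so unfilled mode draws only the
       * polygon's outline.
       */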
brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_EQ);
brw_MOV(p, byte_offset(c->reg.vertex[0],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_EDGE)),
brw_imm_f(0));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_EQ);
brw_MOV(p, byte_offset(c->reg.vertex[2],
brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_EDGE)),
brw_imm_f(0));
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
brw_ENDIF(p);
}
static void apply_one_offset( struct brw_clip_compile *c,
struct brw_indirect vert )
{
struct brw_codegen *p = &c->func;
GLuint ndc_offset = brw_varying_to_offset(&c->vue_map,
BRW_VARYING_SLOT_NDC);
struct brw_reg z = deref_1f(vert, ndc_offset +
2 * type_sz(BRW_REGISTER_TYPE_F));
brw_ADD(p, z, z, vec1(c->reg.offset));
}
/***********************************************************************
* Output clipped polygon as an unfilled primitive:
*/
static void emit_lines(struct brw_clip_compile *c,
bool do_offset)
{
struct brw_codegen *p = &c->func;
struct brw_indirect v0 = brw_indirect(0, 0);
struct brw_indirect v1 = brw_indirect(1, 0);
struct brw_indirect v0ptr = brw_indirect(2, 0);
struct brw_indirect v1ptr = brw_indirect(3, 0);
/* Need a separate loop for offset:
*/
if (do_offset) {
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
brw_DO(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
apply_one_offset(c, v0);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_G);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
/* v1ptr = &inlist[nr_verts]
* *v1ptr = v0
*/
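   /* List entries are 2-byte pointers, so adding nr_verts twice yields
    * &inlist[nr_verts]; copying inlist[0] there closes the line loop.
    */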
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0));
brw_DO(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2));
brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
/* draw edge if edgeflag != 0 */
brw_CMP(p,
vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
deref_1f(v0, brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_EDGE)),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START);
brw_clip_emit_vue(c, v1, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END);
}
brw_ENDIF(p);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
static void emit_points(struct brw_clip_compile *c,
bool do_offset )
{
struct brw_codegen *p = &c->func;
struct brw_indirect v0 = brw_indirect(0, 0);
struct brw_indirect v0ptr = brw_indirect(2, 0);
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
brw_DO(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
/* draw if edgeflag != 0
*/
brw_CMP(p,
vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
deref_1f(v0, brw_varying_to_offset(&c->vue_map,
VARYING_SLOT_EDGE)),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
if (do_offset)
apply_one_offset(c, v0);
brw_clip_emit_vue(c, v0, BRW_URB_WRITE_ALLOCATE_COMPLETE,
(_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
}
brw_ENDIF(p);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
}
brw_WHILE(p);
brw_inst_set_pred_control(p->devinfo, brw_last_inst, BRW_PREDICATE_NORMAL);
}
static void emit_primitives( struct brw_clip_compile *c,
GLuint mode,
bool do_offset )
{
switch (mode) {
case BRW_CLIP_FILL_MODE_FILL:
brw_clip_tri_emit_polygon(c);
break;
case BRW_CLIP_FILL_MODE_LINE:
emit_lines(c, do_offset);
break;
case BRW_CLIP_FILL_MODE_POINT:
emit_points(c, do_offset);
break;
case BRW_CLIP_FILL_MODE_CULL:
unreachable("not reached");
}
}
static void emit_unfilled_primitives( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
/* Direction culling has already been done.
*/
if (c->key.fill_ccw != c->key.fill_cw &&
c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL &&
c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL)
{
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_GE,
get_element(c->reg.dir, 2),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
{
emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
}
brw_ELSE(p);
{
emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
}
brw_ENDIF(p);
}
else if (c->key.fill_cw != BRW_CLIP_FILL_MODE_CULL) {
emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
}
else if (c->key.fill_ccw != BRW_CLIP_FILL_MODE_CULL) {
emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
}
}
static void check_nr_verts( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p);
}
void brw_emit_unfilled_clip( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) ||
(c->key.fill_ccw != c->key.fill_cw) ||
c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL ||
c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL ||
c->key.copy_bfc_cw ||
c->key.copy_bfc_ccw);
brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
brw_clip_tri_init_vertices(c);
brw_clip_init_ff_sync(c);
assert(brw_clip_have_varying(c, VARYING_SLOT_EDGE));
if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL &&
c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL) {
brw_clip_kill_thread(c);
return;
}
merge_edgeflags(c);
/* Need to use the inlist indirection here:
*/
if (c->need_direction)
compute_tri_direction(c);
if (c->key.fill_ccw == BRW_CLIP_FILL_MODE_CULL ||
c->key.fill_cw == BRW_CLIP_FILL_MODE_CULL)
cull_direction(c);
if (c->key.offset_ccw ||
c->key.offset_cw)
compute_offset(c);
if (c->key.copy_bfc_ccw ||
c->key.copy_bfc_cw)
copy_bfc(c);
/* Need to do this whether we clip or not:
*/
if (c->key.contains_flat_varying)
brw_clip_tri_flat_shade(c);
brw_clip_init_clipmask(c);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_init_planes(c);
brw_clip_tri(c);
check_nr_verts(c);
}
brw_ENDIF(p);
emit_unfilled_primitives(c);
brw_clip_kill_thread(c);
}


@ -0,0 +1,464 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_clip.h"
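/* Temporaries are handed out stack-like starting at first_tmp;
 * release_tmp only reclaims the most recently allocated register.
 */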
struct brw_reg get_tmp( struct brw_clip_compile *c )
{
struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0);
if (++c->last_tmp > c->prog_data.total_grf)
c->prog_data.total_grf = c->last_tmp;
return tmp;
}
static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp )
{
if (tmp.nr == c->last_tmp-1)
c->last_tmp--;
}
static struct brw_reg make_plane_ud(GLuint x, GLuint y, GLuint z, GLuint w)
{
return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x);
}
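/* The fixed planes are stored one per dword as four signed bytes
 * (0xff == -1), so e.g. make_plane_ud(0, 0, 0xff, 1) encodes the plane
 * -z + w >= 0.
 */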
void brw_clip_init_planes( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
if (!c->key.nr_userclip) {
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0, 0, 0xff, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0, 0, 1, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff, 0, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0, 1, 0, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff, 0, 0, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1, 0, 0, 1));
}
}
#define W 3
/* Project 'pos' to screen space (or back again), overwrite with results:
*/
void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
{
struct brw_codegen *p = &c->func;
/* calc rhw
*/
brw_math_invert(p, get_element(pos, W), get_element(pos, W));
/* value.xyz *= value.rhw
*/
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos,
brw_swizzle(pos, BRW_SWIZZLE_WWWW));
brw_set_default_access_mode(p, BRW_ALIGN_1);
}
static void brw_clip_project_vertex( struct brw_clip_compile *c,
struct brw_indirect vert_addr )
{
struct brw_codegen *p = &c->func;
struct brw_reg tmp = get_tmp(c);
GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
GLuint ndc_offset = brw_varying_to_offset(&c->vue_map,
BRW_VARYING_SLOT_NDC);
/* Fixup position. Extract from the original vertex and re-project
* to screen space:
*/
brw_MOV(p, tmp, deref_4f(vert_addr, hpos_offset));
brw_clip_project_position(c, tmp);
brw_MOV(p, deref_4f(vert_addr, ndc_offset), tmp);
release_tmp(c, tmp);
}
/* Interpolate between two vertices and put the result into a0.0.
* Increment a0.0 accordingly.
*
* Beware that dest_ptr can be equal to v0_ptr!
*/
void brw_clip_interp_vertex( struct brw_clip_compile *c,
struct brw_indirect dest_ptr,
struct brw_indirect v0_ptr, /* from */
struct brw_indirect v1_ptr, /* to */
struct brw_reg t0,
bool force_edgeflag)
{
struct brw_codegen *p = &c->func;
struct brw_reg t_nopersp, v0_ndc_copy;
GLuint slot;
/* Just copy the vertex header:
*/
   /*
    * After the CLIP stage, only the first 256 bits of the VUE are read
    * back on Ironlake, so we needn't change the rest.
    */
brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
/* First handle the 3D and NDC interpolation, in case we
* need noperspective interpolation. Doing it early has no
* performance impact in any case.
*/
/* Take a copy of the v0 NDC coordinates, in case dest == v0. */
if (c->key.contains_noperspective_varying) {
GLuint offset = brw_varying_to_offset(&c->vue_map,
BRW_VARYING_SLOT_NDC);
v0_ndc_copy = get_tmp(c);
brw_MOV(p, v0_ndc_copy, deref_4f(v0_ptr, offset));
}
/* Compute the new 3D position
*
* dest_hpos = v0_hpos * (1 - t0) + v1_hpos * t0
*/
{
GLuint delta = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
struct brw_reg tmp = get_tmp(c);
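      /* The MUL's result goes to the null register but implicitly primes
       * the accumulator with v1*t0; MAC then yields v1*t0 - v0*t0, and
       * the ADD produces v0 + t0*(v1 - v0).  The same pattern recurs in
       * the per-attribute loop below.
       */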
brw_MUL(p, vec4(brw_null_reg()), deref_4f(v1_ptr, delta), t0);
brw_MAC(p, tmp, negate(deref_4f(v0_ptr, delta)), t0);
brw_ADD(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta), tmp);
release_tmp(c, tmp);
}
/* Recreate the projected (NDC) coordinate in the new vertex header */
brw_clip_project_vertex(c, dest_ptr);
/* If we have noperspective attributes,
* we need to compute the screen-space t
*/
if (c->key.contains_noperspective_varying) {
GLuint delta = brw_varying_to_offset(&c->vue_map,
BRW_VARYING_SLOT_NDC);
struct brw_reg tmp = get_tmp(c);
t_nopersp = get_tmp(c);
/* t_nopersp = vec4(v1.xy, dest.xy) */
brw_MOV(p, t_nopersp, deref_4f(v1_ptr, delta));
brw_MOV(p, tmp, deref_4f(dest_ptr, delta));
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_MOV(p,
brw_writemask(t_nopersp, WRITEMASK_ZW),
brw_swizzle(tmp, BRW_SWIZZLE_XYXY));
/* t_nopersp = vec4(v1.xy, dest.xy) - v0.xyxy */
brw_ADD(p, t_nopersp, t_nopersp,
negate(brw_swizzle(v0_ndc_copy, BRW_SWIZZLE_XYXY)));
/* Add the absolute values of the X and Y deltas so that if
* the points aren't in the same place on the screen we get
* nonzero values to divide.
*
* After that, we have vert1 - vert0 in t_nopersp.x and
* vertnew - vert0 in t_nopersp.y
*
* t_nopersp = vec2(|v1.x -v0.x| + |v1.y -v0.y|,
* |dest.x-v0.x| + |dest.y-v0.y|)
*/
brw_ADD(p,
brw_writemask(t_nopersp, WRITEMASK_XY),
brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_XZXZ)),
brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_YWYW)));
brw_set_default_access_mode(p, BRW_ALIGN_1);
/* If the points are in the same place, just substitute a
* value to avoid divide-by-zero
*/
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ,
vec1(t_nopersp),
brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_1);
brw_MOV(p, t_nopersp, brw_imm_vf4(brw_float_to_vf(1.0),
brw_float_to_vf(0.0),
brw_float_to_vf(0.0),
brw_float_to_vf(0.0)));
brw_ENDIF(p);
/* Now compute t_nopersp = t_nopersp.y/t_nopersp.x and broadcast it. */
brw_math_invert(p, get_element(t_nopersp, 0), get_element(t_nopersp, 0));
brw_MUL(p, vec1(t_nopersp), vec1(t_nopersp),
vec1(suboffset(t_nopersp, 1)));
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, BRW_SWIZZLE_XXXX));
brw_set_default_access_mode(p, BRW_ALIGN_1);
release_tmp(c, tmp);
release_tmp(c, v0_ndc_copy);
}
/* Now we can iterate over each attribute
* (could be done in pairs?)
*/
for (slot = 0; slot < c->vue_map.num_slots; slot++) {
int varying = c->vue_map.slot_to_varying[slot];
GLuint delta = brw_vue_slot_to_offset(slot);
/* HPOS, NDC already handled above */
if (varying == VARYING_SLOT_POS || varying == BRW_VARYING_SLOT_NDC)
continue;
if (varying == VARYING_SLOT_EDGE) {
if (force_edgeflag)
brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
else
brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta));
} else if (varying == VARYING_SLOT_PSIZ) {
/* PSIZ doesn't need interpolation because it isn't used by the
* fragment shader.
*/
} else if (varying < VARYING_SLOT_MAX) {
/* This is a true vertex result (and not a special value for the VUE
* header), so interpolate:
*
* New = attr0 + t*attr1 - t*attr0
*
* Unless the attribute is flat shaded -- in which case just copy
* from one of the sources (doesn't matter which; already copied from pv)
*/
GLuint interp = c->key.interp_mode[slot];
if (interp != INTERP_MODE_FLAT) {
struct brw_reg tmp = get_tmp(c);
struct brw_reg t =
interp == INTERP_MODE_NOPERSPECTIVE ? t_nopersp : t0;
brw_MUL(p,
vec4(brw_null_reg()),
deref_4f(v1_ptr, delta),
t);
brw_MAC(p,
tmp,
negate(deref_4f(v0_ptr, delta)),
t);
brw_ADD(p,
deref_4f(dest_ptr, delta),
deref_4f(v0_ptr, delta),
tmp);
release_tmp(c, tmp);
}
else {
brw_MOV(p,
deref_4f(dest_ptr, delta),
deref_4f(v0_ptr, delta));
}
}
}
if (c->vue_map.num_slots % 2) {
GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots);
brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
}
if (c->key.contains_noperspective_varying)
release_tmp(c, t_nopersp);
}
void brw_clip_emit_vue(struct brw_clip_compile *c,
struct brw_indirect vert,
enum brw_urb_write_flags flags,
GLuint header)
{
struct brw_codegen *p = &c->func;
bool allocate = flags & BRW_URB_WRITE_ALLOCATE;
brw_clip_ff_sync(c);
/* Any URB entry that is allocated must subsequently be used or discarded,
* so it doesn't make sense to mark EOT and ALLOCATE at the same time.
*/
assert(!(allocate && (flags & BRW_URB_WRITE_EOT)));
/* Copy the vertex from vertn into m1..mN+1:
*/
brw_copy_from_indirect(p, brw_message_reg(1), vert, c->nr_regs);
/* Overwrite PrimType and PrimStart in the message header, for
* each vertex in turn:
*/
brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
/* Send each vertex as a separate write to the urb. This
* is different to the concept in brw_sf_emit.c, where
* subsequent writes are used to build up a single urb
* entry. Each of these writes instantiates a separate
* urb entry - (I think... what about 'allocate'?)
*/
brw_urb_WRITE(p,
allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
0,
c->reg.R0,
flags,
c->nr_regs + 1, /* msg length */
allocate ? 1 : 0, /* response_length */
0, /* urb offset */
BRW_URB_SWIZZLE_NONE);
}
void brw_clip_kill_thread(struct brw_clip_compile *c)
{
struct brw_codegen *p = &c->func;
brw_clip_ff_sync(c);
/* Send an empty message to kill the thread and release any
* allocated urb entry:
*/
brw_urb_WRITE(p,
retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
0,
c->reg.R0,
BRW_URB_WRITE_UNUSED | BRW_URB_WRITE_EOT_COMPLETE,
1, /* msg len */
0, /* response len */
0,
BRW_URB_SWIZZLE_NONE);
}
struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c )
{
return brw_address(c->reg.fixed_planes);
}
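/* User clip planes are vec4 floats (16 bytes each); the fixed planes are
 * packed four signed bytes apiece by make_plane_ud, hence the 4-byte
 * stride.
 */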
struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c )
{
if (c->key.nr_userclip) {
return brw_imm_uw(16);
}
else {
return brw_imm_uw(4);
}
}
/* Distribute flatshaded attributes from provoking vertex prior to
* clipping.
*/
void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c,
GLuint to, GLuint from )
{
struct brw_codegen *p = &c->func;
for (int i = 0; i < c->vue_map.num_slots; i++) {
if (c->key.interp_mode[i] == INTERP_MODE_FLAT) {
brw_MOV(p,
byte_offset(c->reg.vertex[to], brw_vue_slot_to_offset(i)),
byte_offset(c->reg.vertex[from], brw_vue_slot_to_offset(i)));
}
}
}
void brw_clip_init_clipmask( struct brw_clip_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg incoming = get_element_ud(c->reg.R0, 2);
/* Shift so that lowest outcode bit is rightmost:
*/
brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26));
if (c->key.nr_userclip) {
struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);
/* Rearrange userclip outcodes so that they come directly after
* the fixed plane bits.
*/
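      /* G45 and Ironlake supply eight user-clip outcode bits starting at
       * bit 14; original Gfx4 supplies only six.
       */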
if (p->devinfo->ver == 5 || p->devinfo->verx10 == 45)
brw_AND(p, tmp, incoming, brw_imm_ud(0xff<<14));
else
brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));
brw_SHR(p, tmp, tmp, brw_imm_ud(8));
brw_OR(p, c->reg.planemask, c->reg.planemask, tmp);
release_tmp(c, tmp);
}
}
void brw_clip_ff_sync(struct brw_clip_compile *c)
{
struct brw_codegen *p = &c->func;
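   /* Ironlake requires a single FF_SYNC message before the thread's first
    * URB write; bit 0 of reg.ff_sync records whether it has been sent.
    */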
if (p->devinfo->ver == 5) {
brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
brw_IF(p, BRW_EXECUTE_1);
{
brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1));
brw_ff_sync(p,
c->reg.R0,
0,
c->reg.R0,
1, /* allocate */
1, /* response length */
0 /* eot */);
}
brw_ENDIF(p);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
}
void brw_clip_init_ff_sync(struct brw_clip_compile *c)
{
struct brw_codegen *p = &c->func;
if (p->devinfo->ver == 5) {
brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
}
}


@ -0,0 +1,97 @@
/*
* Copyright © 2006 - 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_clip.h"
#include "brw_disasm.h"
#include "dev/intel_debug.h"
const unsigned *
brw_compile_clip(const struct brw_compiler *compiler,
void *mem_ctx,
const struct brw_clip_prog_key *key,
struct brw_clip_prog_data *prog_data,
struct intel_vue_map *vue_map,
unsigned *final_assembly_size)
{
struct brw_clip_compile c;
memset(&c, 0, sizeof(c));
/* Begin the compilation:
*/
brw_init_codegen(&compiler->isa, &c.func, mem_ctx);
c.func.single_program_flow = 1;
c.key = *key;
c.vue_map = *vue_map;
/* nr_regs is the number of registers filled by reading data from the VUE.
* This program accesses the entire VUE, so nr_regs needs to be the size of
* the VUE (measured in pairs, since two slots are stored in each
* register).
*/
c.nr_regs = (c.vue_map.num_slots + 1)/2;
c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
/* For some reason the thread is spawned with only 4 channels
* unmasked.
*/
brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);
/* Would ideally have the option of producing a program which could
* do all three:
*/
switch (key->primitive) {
case MESA_PRIM_TRIANGLES:
if (key->do_unfilled)
brw_emit_unfilled_clip( &c );
else
brw_emit_tri_clip( &c );
break;
case MESA_PRIM_LINES:
brw_emit_line_clip( &c );
break;
case MESA_PRIM_POINTS:
brw_emit_point_clip( &c );
break;
default:
unreachable("not reached");
}
brw_compact_instructions(&c.func, 0, NULL);
*prog_data = c.prog_data;
const unsigned *program = brw_get_program(&c.func, final_assembly_size);
if (INTEL_DEBUG(DEBUG_CLIP)) {
fprintf(stderr, "clip:\n");
brw_disassemble_with_labels(&compiler->isa,
program, 0, *final_assembly_size, stderr);
fprintf(stderr, "\n");
}
return program;
}


@ -0,0 +1,662 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_compiler.h"
#include "brw_disasm.h"
#include "brw_eu.h"
#include "brw_prim.h"
#include "dev/intel_debug.h"
#define MAX_GS_VERTS (4)
struct brw_ff_gs_compile {
struct brw_codegen func;
struct brw_ff_gs_prog_key key;
struct brw_ff_gs_prog_data *prog_data;
struct {
struct brw_reg R0;
/**
* Register holding streamed vertex buffer pointers -- see the Sandy
* Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload
* [DevSNB]). These pointers are delivered in GRF 1.
*/
struct brw_reg SVBI;
struct brw_reg vertex[MAX_GS_VERTS];
struct brw_reg header;
struct brw_reg temp;
/**
* Register holding destination indices for streamed buffer writes.
* Only used for SOL programs.
*/
struct brw_reg destination_indices;
} reg;
/* Number of registers used to store vertex data */
GLuint nr_regs;
struct intel_vue_map vue_map;
};
/**
* Allocate registers for GS.
*
* If sol_program is true, then:
*
* - The thread will be spawned with the "SVBI Payload Enable" bit set, so GRF
* 1 needs to be set aside to hold the streamed vertex buffer indices.
*
* - The thread will need to use the destination_indices register.
*/
static void brw_ff_gs_alloc_regs(struct brw_ff_gs_compile *c,
GLuint nr_verts,
bool sol_program)
{
GLuint i = 0,j;
/* Register usage is static, precompute here:
*/
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
/* Streamed vertex buffer indices */
if (sol_program)
c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
/* Payload vertices plus space for more generated vertices:
*/
for (j = 0; j < nr_verts; j++) {
c->reg.vertex[j] = brw_vec4_grf(i, 0);
i += c->nr_regs;
}
c->reg.header = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
c->reg.temp = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
if (sol_program) {
c->reg.destination_indices =
retype(brw_vec4_grf(i++, 0), BRW_REGISTER_TYPE_UD);
}
c->prog_data->urb_read_length = c->nr_regs;
c->prog_data->total_grf = i;
}
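/* For example (illustrative numbers, derived from the code above): a
 * non-SOL program with nr_verts = 4 and nr_regs = 2 puts R0 in g0, the
 * four vertices in g1/g3/g5/g7 (two GRFs each), the header in g9 and
 * temp in g10, so total_grf = 11 and urb_read_length = 2.
 */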
/**
* Set up the initial value of c->reg.header register based on c->reg.R0.
*
* The following information is passed to the GS thread in R0, and needs to be
* included in the first URB_WRITE or FF_SYNC message sent by the GS:
*
* - DWORD 0 [31:0] handle info (Gen4 only)
* - DWORD 5 [7:0] FFTID
* - DWORD 6 [31:0] Debug info
* - DWORD 7 [31:0] Debug info
*
* This function sets up the above data by copying the contents of
* R0 to the header register.
*/
static void brw_ff_gs_initialize_header(struct brw_ff_gs_compile *c)
{
struct brw_codegen *p = &c->func;
brw_MOV(p, c->reg.header, c->reg.R0);
}
/**
* Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value.
*
* In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart,
* PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we
* need to be able to update on a per-vertex basis.
*/
static void brw_ff_gs_overwrite_header_dw2(struct brw_ff_gs_compile *c,
unsigned dw2)
{
struct brw_codegen *p = &c->func;
brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2));
}
/**
* Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
*
* When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
* of DWORD 2. URB_WRITE messages need the primitive type in bits 6:2 of
* DWORD 2. So this function extracts the primitive type field, bitshifts it
* appropriately, and stores it in c->reg.header.
*/
static void brw_ff_gs_overwrite_header_dw2_from_r0(struct brw_ff_gs_compile *c)
{
struct brw_codegen *p = &c->func;
brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
brw_imm_ud(0x1f));
brw_SHL(p, get_element_ud(c->reg.header, 2),
get_element_ud(c->reg.header, 2), brw_imm_ud(2));
}
/**
* Apply an additive offset to DWORD 2 of c->reg.header.
*
* This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately
* for each vertex.
*/
static void brw_ff_gs_offset_header_dw2(struct brw_ff_gs_compile *c,
int offset)
{
struct brw_codegen *p = &c->func;
brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2),
brw_imm_d(offset));
}
/**
* Emit a vertex using the URB_WRITE message. Use the contents of
* c->reg.header for the message header, and the registers starting at \c vert
* for the vertex data.
*
* If \c last is true, then this is the last vertex, so no further URB space
* should be allocated, and this message should end the thread.
*
* If \c last is false, then a new URB entry will be allocated, and its handle
* will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
* message.
*/
static void brw_ff_gs_emit_vue(struct brw_ff_gs_compile *c,
struct brw_reg vert,
bool last)
{
struct brw_codegen *p = &c->func;
int write_offset = 0;
bool complete = false;
do {
/* We can't write more than 14 registers at a time to the URB */
int write_len = MIN2(c->nr_regs - write_offset, 14);
if (write_len == c->nr_regs - write_offset)
complete = true;
/* Copy the vertex from vert[n] into m1..mN+1:
*/
brw_copy8(p, brw_message_reg(1), offset(vert, write_offset), write_len);
/* Send the vertex data to the URB. If this is the last write for this
* vertex, then we mark it as complete, and either end the thread or
* allocate another vertex URB entry (depending whether this is the last
* vertex).
*/
enum brw_urb_write_flags flags;
if (!complete)
flags = BRW_URB_WRITE_NO_FLAGS;
else if (last)
flags = BRW_URB_WRITE_EOT_COMPLETE;
else
flags = BRW_URB_WRITE_ALLOCATE_COMPLETE;
brw_urb_WRITE(p,
(flags & BRW_URB_WRITE_ALLOCATE) ? c->reg.temp
: retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
0,
c->reg.header,
flags,
write_len + 1, /* msg length */
(flags & BRW_URB_WRITE_ALLOCATE) ? 1
: 0, /* response length */
write_offset, /* urb offset */
BRW_URB_SWIZZLE_NONE);
write_offset += write_len;
} while (!complete);
if (!last) {
brw_MOV(p, get_element_ud(c->reg.header, 0),
get_element_ud(c->reg.temp, 0));
}
}
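/* For example, if nr_regs were 20, the loop above would emit a
 * 14-register write at URB offset 0 with no flags, then a 6-register
 * write at offset 14 marked complete, which either ends the thread or
 * allocates the next URB entry depending on \c last.
 */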
/**
* Send an FF_SYNC message to ensure that all previously spawned GS threads
* have finished sending primitives down the pipeline, and to allocate a URB
* entry for the first output vertex. Only needed on Ironlake+.
*
* This function modifies c->reg.header: in DWORD 1, it stores num_prim (which
* is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to
* the allocated URB entry (which will be needed by the URB_WRITE message that
* follows).
*/
static void brw_ff_gs_ff_sync(struct brw_ff_gs_compile *c, int num_prim)
{
struct brw_codegen *p = &c->func;
brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim));
brw_ff_sync(p,
c->reg.temp,
0,
c->reg.header,
1, /* allocate */
1, /* response length */
0 /* eot */);
brw_MOV(p, get_element_ud(c->reg.header, 0),
get_element_ud(c->reg.temp, 0));
}
static void
brw_ff_gs_quads(struct brw_ff_gs_compile *c,
const struct brw_ff_gs_prog_key *key)
{
brw_ff_gs_alloc_regs(c, 4, false);
brw_ff_gs_initialize_header(c);
/* Use polygons for correct edgeflag behaviour. Note that vertex 3
* is the PV for quads, but vertex 0 for polygons:
*/
if (c->func.devinfo->ver == 5)
brw_ff_gs_ff_sync(c, 1);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START));
if (key->pv_first) {
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_overwrite_header_dw2(
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
}
else {
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
brw_ff_gs_overwrite_header_dw2(
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 1);
}
}
static void
brw_ff_gs_quad_strip(struct brw_ff_gs_compile *c,
const struct brw_ff_gs_prog_key *key)
{
brw_ff_gs_alloc_regs(c, 4, false);
brw_ff_gs_initialize_header(c);
if (c->func.devinfo->ver == 5)
brw_ff_gs_ff_sync(c, 1);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START));
if (key->pv_first) {
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_overwrite_header_dw2(
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
}
else {
brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
brw_ff_gs_overwrite_header_dw2(
c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
}
}
static void brw_ff_gs_lines(struct brw_ff_gs_compile *c)
{
brw_ff_gs_alloc_regs(c, 2, false);
brw_ff_gs_initialize_header(c);
if (c->func.devinfo->ver == 5)
brw_ff_gs_ff_sync(c, 1);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_START));
brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
brw_ff_gs_overwrite_header_dw2(
c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
| URB_WRITE_PRIM_END));
brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
}
/**
* Generate the geometry shader program used on Gen6 to perform stream output
* (transform feedback).
*/
static void
gfx6_sol_program(struct brw_ff_gs_compile *c, const struct brw_ff_gs_prog_key *key,
unsigned num_verts, bool check_edge_flags)
{
struct brw_codegen *p = &c->func;
brw_inst *inst;
c->prog_data->svbi_postincrement_value = num_verts;
brw_ff_gs_alloc_regs(c, num_verts, true);
brw_ff_gs_initialize_header(c);
if (key->num_transform_feedback_bindings > 0) {
unsigned vertex, binding;
struct brw_reg destination_indices_uw =
vec8(retype(c->reg.destination_indices, BRW_REGISTER_TYPE_UW));
/* Note: since we use the binding table to keep track of buffer offsets
* and stride, the GS doesn't need to keep track of a separate pointer
* into each buffer; it uses a single pointer which increments by 1 for
* each vertex. So we use SVBI0 for this pointer, regardless of whether
* transform feedback is in interleaved or separate attribs mode.
*
* Make sure that the buffers have enough room for all the vertices.
*/
brw_ADD(p, get_element_ud(c->reg.temp, 0),
get_element_ud(c->reg.SVBI, 0), brw_imm_ud(num_verts));
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE,
get_element_ud(c->reg.temp, 0),
get_element_ud(c->reg.SVBI, 4));
brw_IF(p, BRW_EXECUTE_1);
/* Compute the destination indices to write to. Usually we use SVBI[0]
* + (0, 1, 2). However, for odd-numbered triangles in tristrips, the
* vertices come down the pipeline in reversed winding order, so we need
* to flip the order when writing to the transform feedback buffer. To
* ensure that flatshading accuracy is preserved, we need to write them
* in order SVBI[0] + (0, 2, 1) if we're using the first provoking
* vertex convention, and in order SVBI[0] + (1, 0, 2) if we're using
* the last provoking vertex convention.
*
* Note: since brw_imm_v can only be used in instructions in
* packed-word execution mode, and SVBI is a double-word, we need to
* first move the appropriate immediate constant ((0, 1, 2), (0, 2, 1),
* or (1, 0, 2)) to the destination_indices register, and then add SVBI
* using a separate instruction. Also, since the immediate constant is
* expressed as packed words, and we need to load double-words into
* destination_indices, we need to intersperse zeros to fill the upper
* halves of each double-word.
*/
brw_MOV(p, destination_indices_uw,
brw_imm_v(0x00020100)); /* (0, 1, 2) */
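/* The V-type immediate expands to eight 4-bit values, read here as
 * eight words: 0x00020100 becomes (0, 0, 1, 0, 2, 0, 0, 0), i.e. the
 * dwords (0, 1, 2, 0) once the interspersed zeros fill the upper
 * halves.
 */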
if (num_verts == 3) {
/* Get primitive type into temp register. */
brw_AND(p, get_element_ud(c->reg.temp, 0),
get_element_ud(c->reg.R0, 2), brw_imm_ud(0x1f));
/* Test if primitive type is TRISTRIP_REVERSE. We need to do this as
* an 8-wide comparison so that the conditional MOV that follows
* moves all 8 words correctly.
*/
brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_EQ,
get_element_ud(c->reg.temp, 0),
brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
/* If so, then overwrite destination_indices_uw with the appropriate
* reordering.
*/
inst = brw_MOV(p, destination_indices_uw,
brw_imm_v(key->pv_first ? 0x00010200 /* (0, 2, 1) */
: 0x00020001)); /* (1, 0, 2) */
brw_inst_set_pred_control(p->devinfo, inst, BRW_PREDICATE_NORMAL);
}
assert(c->reg.destination_indices.width == BRW_EXECUTE_4);
brw_push_insn_state(p);
brw_set_default_exec_size(p, BRW_EXECUTE_4);
brw_ADD(p, c->reg.destination_indices,
c->reg.destination_indices, get_element_ud(c->reg.SVBI, 0));
brw_pop_insn_state(p);
/* For each vertex, generate code to output each varying using the
* appropriate binding table entry.
*/
for (vertex = 0; vertex < num_verts; ++vertex) {
/* Set up the correct destination index for this vertex */
brw_MOV(p, get_element_ud(c->reg.header, 5),
get_element_ud(c->reg.destination_indices, vertex));
for (binding = 0; binding < key->num_transform_feedback_bindings;
++binding) {
unsigned char varying =
key->transform_feedback_bindings[binding];
unsigned char slot = c->vue_map.varying_to_slot[varying];
/* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
*
* "Prior to End of Thread with a URB_WRITE, the kernel must
* ensure that all writes are complete by sending the final
* write as a committed write."
*/
bool final_write =
binding == key->num_transform_feedback_bindings - 1 &&
vertex == num_verts - 1;
struct brw_reg vertex_slot = c->reg.vertex[vertex];
vertex_slot.nr += slot / 2;
vertex_slot.subnr = (slot % 2) * 16;
/* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */
vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ
? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding];
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_push_insn_state(p);
brw_set_default_exec_size(p, BRW_EXECUTE_4);
brw_MOV(p, stride(c->reg.header, 4, 4, 1),
retype(vertex_slot, BRW_REGISTER_TYPE_UD));
brw_pop_insn_state(p);
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_svb_write(p,
final_write ? c->reg.temp : brw_null_reg(), /* dest */
1, /* msg_reg_nr */
c->reg.header, /* src0 */
BRW_GFX6_SOL_BINDING_START + binding, /* binding_table_index */
final_write); /* send_commit_msg */
}
}
brw_ENDIF(p);
/* Now, reinitialize the header register from R0 to restore the parts of
* the register that we overwrote while streaming out transform feedback
* data.
*/
brw_ff_gs_initialize_header(c);
/* Finally, wait for the write commit to occur so that we can proceed to
* other things safely.
*
* From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
*
* The write commit does not modify the destination register, but
* merely clears the dependency associated with the destination
* register. Thus, a simple mov instruction using the register as a
* source is sufficient to wait for the write commit to occur.
*/
brw_MOV(p, c->reg.temp, c->reg.temp);
}
brw_ff_gs_ff_sync(c, 1);
brw_ff_gs_overwrite_header_dw2_from_r0(c);
switch (num_verts) {
case 1:
brw_ff_gs_offset_header_dw2(c,
URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], true);
break;
case 2:
brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
brw_ff_gs_offset_header_dw2(c,
URB_WRITE_PRIM_END - URB_WRITE_PRIM_START);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], true);
break;
case 3:
if (check_edge_flags) {
/* Only emit vertices 0 and 1 if this is the first triangle of the
* polygon. Otherwise they are redundant.
*/
brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
get_element_ud(c->reg.R0, 2),
brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_IF(p, BRW_EXECUTE_1);
}
brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
brw_ff_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START);
brw_ff_gs_emit_vue(c, c->reg.vertex[1], false);
if (check_edge_flags) {
brw_ENDIF(p);
/* Only emit vertex 2 in PRIM_END mode if this is the last triangle
* of the polygon. Otherwise leave the primitive incomplete because
* there are more polygon vertices coming.
*/
brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
get_element_ud(c->reg.R0, 2),
brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
}
brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_END);
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
brw_ff_gs_emit_vue(c, c->reg.vertex[2], true);
break;
}
}
const unsigned *
brw_compile_ff_gs_prog(struct brw_compiler *compiler,
void *mem_ctx,
const struct brw_ff_gs_prog_key *key,
struct brw_ff_gs_prog_data *prog_data,
struct intel_vue_map *vue_map,
unsigned *final_assembly_size)
{
struct brw_ff_gs_compile c;
const GLuint *program;
memset(&c, 0, sizeof(c));
c.key = *key;
c.vue_map = *vue_map;
c.nr_regs = (c.vue_map.num_slots + 1)/2;
c.prog_data = prog_data;
mem_ctx = ralloc_context(NULL);
/* Begin the compilation:
*/
brw_init_codegen(&compiler->isa, &c.func, mem_ctx);
c.func.single_program_flow = 1;
/* For some reason the thread is spawned with only 4 channels
* unmasked.
*/
brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);
if (compiler->devinfo->ver >= 6) {
unsigned num_verts;
bool check_edge_flag;
/* On Sandybridge, we use the GS for implementing transform feedback
* (called "Stream Out" in the PRM).
*/
switch (key->primitive) {
case _3DPRIM_POINTLIST:
num_verts = 1;
check_edge_flag = false;
break;
case _3DPRIM_LINELIST:
case _3DPRIM_LINESTRIP:
case _3DPRIM_LINELOOP:
num_verts = 2;
check_edge_flag = false;
break;
case _3DPRIM_TRILIST:
case _3DPRIM_TRIFAN:
case _3DPRIM_TRISTRIP:
case _3DPRIM_RECTLIST:
num_verts = 3;
check_edge_flag = false;
break;
case _3DPRIM_QUADLIST:
case _3DPRIM_QUADSTRIP:
case _3DPRIM_POLYGON:
num_verts = 3;
check_edge_flag = true;
break;
default:
unreachable("Unexpected primitive type in Gen6 SOL program.");
}
gfx6_sol_program(&c, key, num_verts, check_edge_flag);
} else {
/* On Gen4-5, we use the GS to decompose certain types of primitives.
* Note that primitives which don't require a GS program have already
* been weeded out by now.
*/
switch (key->primitive) {
case _3DPRIM_QUADLIST:
brw_ff_gs_quads( &c, key );
break;
case _3DPRIM_QUADSTRIP:
brw_ff_gs_quad_strip( &c, key );
break;
case _3DPRIM_LINELOOP:
brw_ff_gs_lines( &c );
break;
default:
return NULL;
}
}
brw_compact_instructions(&c.func, 0, NULL);
/* get the program
*/
program = brw_get_program(&c.func, final_assembly_size);
if (INTEL_DEBUG(DEBUG_GS)) {
fprintf(stderr, "gs:\n");
brw_disassemble_with_labels(&compiler->isa, c.func.store,
0, *final_assembly_size, stderr);
fprintf(stderr, "\n");
}
return program;
}

View file

@ -0,0 +1,881 @@
/*
* Copyright © 2006 - 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_compiler.h"
#include "brw_disasm.h"
#include "brw_eu.h"
#include "brw_prim.h"
#include "dev/intel_debug.h"
struct brw_sf_compile {
struct brw_codegen func;
struct brw_sf_prog_key key;
struct brw_sf_prog_data prog_data;
struct brw_reg pv;
struct brw_reg det;
struct brw_reg dx0;
struct brw_reg dx2;
struct brw_reg dy0;
struct brw_reg dy2;
/* z and 1/w passed in separately:
*/
struct brw_reg z[3];
struct brw_reg inv_w[3];
/* The vertices:
*/
struct brw_reg vert[3];
/* Temporaries, allocated after last vertex reg.
*/
struct brw_reg inv_det;
struct brw_reg a1_sub_a0;
struct brw_reg a2_sub_a0;
struct brw_reg tmp;
struct brw_reg m1Cx;
struct brw_reg m2Cy;
struct brw_reg m3C0;
GLuint nr_verts;
GLuint nr_attr_regs;
GLuint nr_setup_regs;
int urb_entry_read_offset;
/** The last known value of the f0.0 flag register. */
unsigned flag_value;
struct intel_vue_map vue_map;
};
/**
* Determine the vue slot corresponding to the given half of the given register.
*/
static inline int vert_reg_to_vue_slot(struct brw_sf_compile *c, GLuint reg,
int half)
{
return (reg + c->urb_entry_read_offset) * 2 + half;
}
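/* For example, with urb_entry_read_offset = 1, reg 0 holds VUE slots 2
 * (half 0) and 3 (half 1); get_vue_slot() below inverts this mapping.
 */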
/**
* Determine the varying corresponding to the given half of the given
* register. half=0 means the first half of a register, half=1 means the
* second half.
*/
static inline int vert_reg_to_varying(struct brw_sf_compile *c, GLuint reg,
int half)
{
int vue_slot = vert_reg_to_vue_slot(c, reg, half);
return c->vue_map.slot_to_varying[vue_slot];
}
/**
* Determine the register corresponding to the given vue slot
*/
static struct brw_reg get_vue_slot(struct brw_sf_compile *c,
struct brw_reg vert,
int vue_slot)
{
GLuint off = vue_slot / 2 - c->urb_entry_read_offset;
GLuint sub = vue_slot % 2;
return brw_vec4_grf(vert.nr + off, sub * 4);
}
/**
* Determine the register corresponding to the given varying.
*/
static struct brw_reg get_varying(struct brw_sf_compile *c,
struct brw_reg vert,
GLuint varying)
{
int vue_slot = c->vue_map.varying_to_slot[varying];
assert(vue_slot >= c->urb_entry_read_offset);
return get_vue_slot(c, vert, vue_slot);
}
static bool
have_attr(struct brw_sf_compile *c, GLuint attr)
{
return (c->key.attrs & BITFIELD64_BIT(attr)) ? 1 : 0;
}
/***********************************************************************
* Twoside lighting
*/
static void copy_bfc( struct brw_sf_compile *c,
struct brw_reg vert )
{
struct brw_codegen *p = &c->func;
GLuint i;
for (i = 0; i < 2; i++) {
if (have_attr(c, VARYING_SLOT_COL0+i) &&
have_attr(c, VARYING_SLOT_BFC0+i))
brw_MOV(p,
get_varying(c, vert, VARYING_SLOT_COL0+i),
get_varying(c, vert, VARYING_SLOT_BFC0+i));
}
}
static void do_twoside_color( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint backface_conditional = c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L;
/* Already done in clip program:
*/
if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
return;
/* If the vertex shader provides backface color, do the selection. The VS
* promises to set up the front color if the backface color is provided, but
* it may contain junk if never written to.
*/
if (!(have_attr(c, VARYING_SLOT_COL0) && have_attr(c, VARYING_SLOT_BFC0)) &&
!(have_attr(c, VARYING_SLOT_COL1) && have_attr(c, VARYING_SLOT_BFC1)))
return;
/* Need to use BRW_EXECUTE_4 and also do a 4-wide compare in order
* to get all channels active inside the IF. In the clipping code
* we run with NoMask, so it's not an option and we can use
* BRW_EXECUTE_1 for all comparisons.
*/
brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0));
brw_IF(p, BRW_EXECUTE_4);
{
switch (c->nr_verts) {
case 3: copy_bfc(c, c->vert[2]); FALLTHROUGH;
case 2: copy_bfc(c, c->vert[1]); FALLTHROUGH;
case 1: copy_bfc(c, c->vert[0]);
}
}
brw_ENDIF(p);
}
/***********************************************************************
* Flat shading
*/
static void copy_flatshaded_attributes(struct brw_sf_compile *c,
struct brw_reg dst,
struct brw_reg src)
{
struct brw_codegen *p = &c->func;
int i;
for (i = 0; i < c->vue_map.num_slots; i++) {
if (c->key.interp_mode[i] == INTERP_MODE_FLAT) {
brw_MOV(p,
get_vue_slot(c, dst, i),
get_vue_slot(c, src, i));
}
}
}
static int count_flatshaded_attributes(struct brw_sf_compile *c)
{
int i;
int count = 0;
for (i = 0; i < c->vue_map.num_slots; i++)
if (c->key.interp_mode[i] == INTERP_MODE_FLAT)
count++;
return count;
}
/* Need to use a computed jump to copy flatshaded attributes as the
* vertices are ordered according to y-coordinate before reaching this
* point, so the PV could be anywhere.
*/
static void do_flatshade_triangle( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint nr;
GLuint jmpi = 1;
/* Already done in clip program:
*/
if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
return;
if (p->devinfo->ver == 5)
jmpi = 2;
nr = count_flatshaded_attributes(c);
brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
brw_JMPI(p, c->pv, BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[1], c->vert[0]);
copy_flatshaded_attributes(c, c->vert[2], c->vert[0]);
brw_JMPI(p, brw_imm_d(jmpi*(nr*4+1)), BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[0], c->vert[1]);
copy_flatshaded_attributes(c, c->vert[2], c->vert[1]);
brw_JMPI(p, brw_imm_d(jmpi*nr*2), BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[0], c->vert[2]);
copy_flatshaded_attributes(c, c->vert[1], c->vert[2]);
}
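/* Each per-PV block above is nr*2 MOVs plus a trailing JMPI (the last
 * block omits the jump), so scaling c->pv by (nr*2 + 1) skips exactly
 * zero, one or two blocks: e.g. with nr = 2 and jmpi = 1, pv = 1 jumps
 * over the five instructions of the vert[0] block. The jmpi factor of 2
 * on Gen5 reflects that its jump offsets are counted in 64-bit units
 * rather than whole 128-bit instructions.
 */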
static void do_flatshade_line( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint nr;
GLuint jmpi = 1;
/* Already done in clip program:
*/
if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
return;
if (p->devinfo->ver == 5)
jmpi = 2;
nr = count_flatshaded_attributes(c);
brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
brw_JMPI(p, c->pv, BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[1], c->vert[0]);
brw_JMPI(p, brw_imm_ud(jmpi*nr), BRW_PREDICATE_NONE);
copy_flatshaded_attributes(c, c->vert[0], c->vert[1]);
}
/***********************************************************************
* Triangle setup.
*/
static void alloc_regs( struct brw_sf_compile *c )
{
GLuint reg, i;
/* Values computed by fixed function unit:
*/
c->pv = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
c->det = brw_vec1_grf(1, 2);
c->dx0 = brw_vec1_grf(1, 3);
c->dx2 = brw_vec1_grf(1, 4);
c->dy0 = brw_vec1_grf(1, 5);
c->dy2 = brw_vec1_grf(1, 6);
/* z and 1/w passed in separately:
*/
c->z[0] = brw_vec1_grf(2, 0);
c->inv_w[0] = brw_vec1_grf(2, 1);
c->z[1] = brw_vec1_grf(2, 2);
c->inv_w[1] = brw_vec1_grf(2, 3);
c->z[2] = brw_vec1_grf(2, 4);
c->inv_w[2] = brw_vec1_grf(2, 5);
/* The vertices:
*/
reg = 3;
for (i = 0; i < c->nr_verts; i++) {
c->vert[i] = brw_vec8_grf(reg, 0);
reg += c->nr_attr_regs;
}
/* Temporaries, allocated after last vertex reg.
*/
c->inv_det = brw_vec1_grf(reg, 0); reg++;
c->a1_sub_a0 = brw_vec8_grf(reg, 0); reg++;
c->a2_sub_a0 = brw_vec8_grf(reg, 0); reg++;
c->tmp = brw_vec8_grf(reg, 0); reg++;
/* Note grf allocation:
*/
c->prog_data.total_grf = reg;
/* Outputs of this program - interpolation coefficients for
* rasterization:
*/
c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0);
c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0);
c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0);
}
static void copy_z_inv_w( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
GLuint i;
/* Copy both scalars with a single MOV:
*/
for (i = 0; i < c->nr_verts; i++)
brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i]));
}
static void invert_det( struct brw_sf_compile *c)
{
/* Looks like we invert all 8 elements just to get 1/det in
* position 2 !?!
*/
gfx4_math(&c->func,
c->inv_det,
BRW_MATH_FUNCTION_INV,
0,
c->det,
BRW_MATH_PRECISION_FULL);
}
static bool
calculate_masks(struct brw_sf_compile *c,
GLuint reg,
GLushort *pc,
GLushort *pc_persp,
GLushort *pc_linear)
{
bool is_last_attr = (reg == c->nr_setup_regs - 1);
enum glsl_interp_mode interp;
*pc_persp = 0;
*pc_linear = 0;
*pc = 0xf;
interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 0)];
if (interp == INTERP_MODE_SMOOTH) {
*pc_linear = 0xf;
*pc_persp = 0xf;
} else if (interp == INTERP_MODE_NOPERSPECTIVE)
*pc_linear = 0xf;
/* Maybe only process one attribute on the final round:
*/
if (vert_reg_to_varying(c, reg, 1) != BRW_VARYING_SLOT_COUNT) {
*pc |= 0xf0;
interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 1)];
if (interp == INTERP_MODE_SMOOTH) {
*pc_linear |= 0xf0;
*pc_persp |= 0xf0;
} else if (interp == INTERP_MODE_NOPERSPECTIVE)
*pc_linear |= 0xf0;
}
return is_last_attr;
}
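/* For example, a register whose two attribute halves are both
 * INTERP_MODE_SMOOTH yields pc = 0xff and pc_persp = pc_linear = 0xff,
 * while two flat halves yield pc = 0xff with zero persp/linear masks
 * (flat attributes still need the constant term written).
 */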
/* Calculates the predicate control for which channels of a reg
* (containing 2 attrs) to do point sprite coordinate replacement on.
*/
static uint16_t
calculate_point_sprite_mask(struct brw_sf_compile *c, GLuint reg)
{
int varying1, varying2;
uint16_t pc = 0;
varying1 = vert_reg_to_varying(c, reg, 0);
if (varying1 >= VARYING_SLOT_TEX0 && varying1 <= VARYING_SLOT_TEX7) {
if (c->key.point_sprite_coord_replace & (1 << (varying1 - VARYING_SLOT_TEX0)))
pc |= 0x0f;
}
if (varying1 == BRW_VARYING_SLOT_PNTC)
pc |= 0x0f;
varying2 = vert_reg_to_varying(c, reg, 1);
if (varying2 >= VARYING_SLOT_TEX0 && varying2 <= VARYING_SLOT_TEX7) {
if (c->key.point_sprite_coord_replace & (1 << (varying2 -
VARYING_SLOT_TEX0)))
pc |= 0xf0;
}
if (varying2 == BRW_VARYING_SLOT_PNTC)
pc |= 0xf0;
return pc;
}
static void
set_predicate_control_flag_value(struct brw_codegen *p,
struct brw_sf_compile *c,
unsigned value)
{
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
if (value != 0xff) {
if (value != c->flag_value) {
brw_MOV(p, brw_flag_reg(0, 0), brw_imm_uw(value));
c->flag_value = value;
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
}
}
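/* A value of 0xff enables all eight channels, so no predication is
 * needed at all; any other mask is loaded into f0.0 only when it
 * differs from the cached c->flag_value, avoiding redundant MOVs
 * between attributes.
 */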
static void brw_emit_tri_setup(struct brw_sf_compile *c, bool allocate)
{
struct brw_codegen *p = &c->func;
GLuint i;
c->flag_value = 0xff;
c->nr_verts = 3;
if (allocate)
alloc_regs(c);
invert_det(c);
copy_z_inv_w(c);
if (c->key.do_twoside_color)
do_twoside_color(c);
if (c->key.contains_flat_varying)
do_flatshade_triangle(c);
for (i = 0; i < c->nr_setup_regs; i++)
{
/* Pair of incoming attributes:
*/
struct brw_reg a0 = offset(c->vert[0], i);
struct brw_reg a1 = offset(c->vert[1], i);
struct brw_reg a2 = offset(c->vert[2], i);
GLushort pc, pc_persp, pc_linear;
bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
set_predicate_control_flag_value(p, c, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
brw_MUL(p, a1, a1, c->inv_w[1]);
brw_MUL(p, a2, a2, c->inv_w[2]);
}
/* Calculate coefficients for interpolated values:
*/
if (pc_linear)
{
set_predicate_control_flag_value(p, c, pc_linear);
brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
brw_ADD(p, c->a2_sub_a0, a2, negate(a0));
/* calculate dA/dx
*/
brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2);
brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0));
brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
/* calculate dA/dy
*/
brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0);
brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2));
brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
}
{
set_predicate_control_flag_value(p, c, pc);
/* start point for interpolation
*/
brw_MOV(p, c->m3C0, a0);
/* Copy m0..m3 to URB. m0 is implicitly copied from r0 in
* the send instruction:
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
last ? BRW_URB_WRITE_EOT_COMPLETE
: BRW_URB_WRITE_NO_FLAGS,
4, /* msg len */
0, /* response len */
i*4, /* offset */
BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */
}
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
static void brw_emit_line_setup(struct brw_sf_compile *c, bool allocate)
{
struct brw_codegen *p = &c->func;
GLuint i;
c->flag_value = 0xff;
c->nr_verts = 2;
if (allocate)
alloc_regs(c);
invert_det(c);
copy_z_inv_w(c);
if (c->key.contains_flat_varying)
do_flatshade_line(c);
for (i = 0; i < c->nr_setup_regs; i++)
{
/* Pair of incoming attributes:
*/
struct brw_reg a0 = offset(c->vert[0], i);
struct brw_reg a1 = offset(c->vert[1], i);
GLushort pc, pc_persp, pc_linear;
bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
set_predicate_control_flag_value(p, c, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
brw_MUL(p, a1, a1, c->inv_w[1]);
}
/* Calculate coefficients for position, color:
*/
if (pc_linear) {
set_predicate_control_flag_value(p, c, pc_linear);
brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
}
{
set_predicate_control_flag_value(p, c, pc);
/* start point for interpolation
*/
brw_MOV(p, c->m3C0, a0);
/* Copy m0..m3 to URB.
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0),
last ? BRW_URB_WRITE_EOT_COMPLETE
: BRW_URB_WRITE_NO_FLAGS,
4, /* msg len */
0, /* response len */
i*4, /* urb destination offset */
BRW_URB_SWIZZLE_TRANSPOSE);
}
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
static void brw_emit_point_sprite_setup(struct brw_sf_compile *c, bool allocate)
{
struct brw_codegen *p = &c->func;
GLuint i;
c->flag_value = 0xff;
c->nr_verts = 1;
if (allocate)
alloc_regs(c);
copy_z_inv_w(c);
for (i = 0; i < c->nr_setup_regs; i++)
{
struct brw_reg a0 = offset(c->vert[0], i);
GLushort pc, pc_persp, pc_linear, pc_coord_replace;
bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
pc_coord_replace = calculate_point_sprite_mask(c, i);
pc_persp &= ~pc_coord_replace;
if (pc_persp) {
set_predicate_control_flag_value(p, c, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
}
/* Point sprite coordinate replacement: A texcoord with this
* enabled gets replaced with the value (x, y, 0, 1) where x and
* y vary from 0 to 1 across the horizontal and vertical of the
* point.
*/
if (pc_coord_replace) {
set_predicate_control_flag_value(p, c, pc_coord_replace);
/* Calculate 1.0/PointWidth */
gfx4_math(&c->func,
c->tmp,
BRW_MATH_FUNCTION_INV,
0,
c->dx0,
BRW_MATH_PRECISION_FULL);
brw_set_default_access_mode(p, BRW_ALIGN_16);
/* dA/dx, dA/dy */
brw_MOV(p, c->m1Cx, brw_imm_f(0.0));
brw_MOV(p, c->m2Cy, brw_imm_f(0.0));
brw_MOV(p, brw_writemask(c->m1Cx, WRITEMASK_X), c->tmp);
if (c->key.sprite_origin_lower_left) {
brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), negate(c->tmp));
} else {
brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), c->tmp);
}
/* attribute constant offset */
brw_MOV(p, c->m3C0, brw_imm_f(0.0));
if (c->key.sprite_origin_lower_left) {
brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_YW), brw_imm_f(1.0));
} else {
brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_W), brw_imm_f(1.0));
}
brw_set_default_access_mode(p, BRW_ALIGN_1);
}
if (pc & ~pc_coord_replace) {
set_predicate_control_flag_value(p, c, pc & ~pc_coord_replace);
brw_MOV(p, c->m1Cx, brw_imm_ud(0));
brw_MOV(p, c->m2Cy, brw_imm_ud(0));
brw_MOV(p, c->m3C0, a0); /* constant value */
}
set_predicate_control_flag_value(p, c, pc);
/* Copy m0..m3 to URB. */
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0),
last ? BRW_URB_WRITE_EOT_COMPLETE
: BRW_URB_WRITE_NO_FLAGS,
4, /* msg len */
0, /* response len */
i*4, /* urb destination offset */
BRW_URB_SWIZZLE_TRANSPOSE);
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
/* Points setup - several simplifications as all attributes are
* constant across the face of the point (point sprites excluded!)
*/
static void brw_emit_point_setup(struct brw_sf_compile *c, bool allocate)
{
struct brw_codegen *p = &c->func;
GLuint i;
c->flag_value = 0xff;
c->nr_verts = 1;
if (allocate)
alloc_regs(c);
copy_z_inv_w(c);
brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */
brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */
for (i = 0; i < c->nr_setup_regs; i++)
{
struct brw_reg a0 = offset(c->vert[0], i);
GLushort pc, pc_persp, pc_linear;
bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
/* This seems odd as the values are all constant, but the
* fragment shader will be expecting it:
*/
set_predicate_control_flag_value(p, c, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
}
/* The delta values are always zero, just send the starting
* coordinate. Again, this is to fit in with the interpolation
* code in the fragment shader.
*/
{
set_predicate_control_flag_value(p, c, pc);
brw_MOV(p, c->m3C0, a0); /* constant value */
/* Copy m0..m3 to URB.
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0),
last ? BRW_URB_WRITE_EOT_COMPLETE
: BRW_URB_WRITE_NO_FLAGS,
4, /* msg len */
0, /* response len */
i*4, /* urb destination offset */
BRW_URB_SWIZZLE_TRANSPOSE);
}
}
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}
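/* Emit a setup program that handles any primitive type, selecting the
 * triangle, line, point-sprite or point path at run time from the
 * primitive-type and sprite-enable fields in the payload. Registers are
 * allocated once up front, so each path below is emitted with
 * allocate = false.
 */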
static void brw_emit_anyprim_setup( struct brw_sf_compile *c )
{
struct brw_codegen *p = &c->func;
struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
struct brw_reg primmask;
int jmp;
struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
c->nr_verts = 3;
alloc_regs(c);
primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);
brw_MOV(p, primmask, brw_imm_ud(1));
brw_SHL(p, primmask, primmask, payload_prim);
brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
(1<<_3DPRIM_TRISTRIP) |
(1<<_3DPRIM_TRIFAN) |
(1<<_3DPRIM_TRISTRIP_REVERSE) |
(1<<_3DPRIM_POLYGON) |
(1<<_3DPRIM_RECTLIST) |
(1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
brw_emit_tri_setup(c, false);
brw_land_fwd_jump(p, jmp);
brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
(1<<_3DPRIM_LINESTRIP) |
(1<<_3DPRIM_LINELOOP) |
(1<<_3DPRIM_LINESTRIP_CONT) |
(1<<_3DPRIM_LINESTRIP_BF) |
(1<<_3DPRIM_LINESTRIP_CONT_BF)));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
brw_emit_line_setup(c, false);
brw_land_fwd_jump(p, jmp);
brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
brw_emit_point_sprite_setup(c, false);
brw_land_fwd_jump(p, jmp);
brw_emit_point_setup( c, false );
}
const unsigned *
brw_compile_sf(const struct brw_compiler *compiler,
void *mem_ctx,
const struct brw_sf_prog_key *key,
struct brw_sf_prog_data *prog_data,
struct intel_vue_map *vue_map,
unsigned *final_assembly_size)
{
struct brw_sf_compile c;
memset(&c, 0, sizeof(c));
/* Begin the compilation:
*/
brw_init_codegen(&compiler->isa, &c.func, mem_ctx);
c.key = *key;
c.vue_map = *vue_map;
if (c.key.do_point_coord) {
/*
* gl_PointCoord is an FS builtin rather than a VS one, so it is not
* included in the c.vue_map generated at the VS stage. Add it manually
* here so the SF shader generates the interpolation coefficients the
* FS needs.
*/
c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots;
c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC;
}
c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset;
c.nr_setup_regs = c.nr_attr_regs;
c.prog_data.urb_read_length = c.nr_attr_regs;
c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
/* Which primitive? Or all three?
*/
switch (key->primitive) {
case BRW_SF_PRIM_TRIANGLES:
c.nr_verts = 3;
brw_emit_tri_setup( &c, true );
break;
case BRW_SF_PRIM_LINES:
c.nr_verts = 2;
brw_emit_line_setup( &c, true );
break;
case BRW_SF_PRIM_POINTS:
c.nr_verts = 1;
if (key->do_point_sprite)
brw_emit_point_sprite_setup( &c, true );
else
brw_emit_point_setup( &c, true );
break;
case BRW_SF_PRIM_UNFILLED_TRIS:
c.nr_verts = 3;
brw_emit_anyprim_setup( &c );
break;
default:
unreachable("not reached");
}
/* FINISHME: SF programs use calculated jumps (i.e., JMPI with a register
* source). Compacting would be difficult.
*/
/* brw_compact_instructions(&c.func, 0, 0, NULL); */
*prog_data = c.prog_data;
const unsigned *program = brw_get_program(&c.func, final_assembly_size);
if (INTEL_DEBUG(DEBUG_SF)) {
fprintf(stderr, "sf:\n");
brw_disassemble_with_labels(&compiler->isa,
program, 0, *final_assembly_size, stderr);
fprintf(stderr, "\n");
}
return program;
}

View file

@ -0,0 +1,370 @@
/*
* Copyright © 2015-2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_compiler.h"
#include "brw_shader.h"
#include "brw_eu.h"
#include "brw_nir.h"
#include "dev/intel_debug.h"
#include "compiler/nir/nir.h"
#include "util/u_debug.h"
#define COMMON_OPTIONS \
.has_uclz = true, \
.lower_fdiv = true, \
.lower_scmp = true, \
.lower_flrp16 = true, \
.lower_fmod = true, \
.lower_ufind_msb = true, \
.lower_uadd_carry = true, \
.lower_usub_borrow = true, \
.lower_flrp64 = true, \
.lower_fisnormal = true, \
.lower_isign = true, \
.lower_ldexp = true, \
.lower_bitfield_extract = true, \
.lower_bitfield_insert = true, \
.lower_device_index_to_zero = true, \
.vectorize_io = true, \
.vectorize_tess_levels = true, \
.use_interpolated_input_intrinsics = true, \
.lower_insert_byte = true, \
.lower_insert_word = true, \
.vertex_id_zero_based = true, \
.lower_base_vertex = true, \
.support_16bit_alu = true, \
.lower_uniforms_to_ubo = true
#define COMMON_SCALAR_OPTIONS \
.lower_to_scalar = true, \
.lower_pack_half_2x16 = true, \
.lower_pack_snorm_2x16 = true, \
.lower_pack_snorm_4x8 = true, \
.lower_pack_unorm_2x16 = true, \
.lower_pack_unorm_4x8 = true, \
.lower_unpack_half_2x16 = true, \
.lower_unpack_snorm_2x16 = true, \
.lower_unpack_snorm_4x8 = true, \
.lower_unpack_unorm_2x16 = true, \
.lower_unpack_unorm_4x8 = true, \
.lower_hadd64 = true, \
.avoid_ternary_with_two_constants = true, \
.has_pack_32_4x8 = true, \
.max_unroll_iterations = 32, \
.force_indirect_unrolling = nir_var_function_temp, \
.divergence_analysis_options = \
(nir_divergence_single_patch_per_tcs_subgroup | \
nir_divergence_single_patch_per_tes_subgroup | \
nir_divergence_shader_record_ptr_uniform)
const struct nir_shader_compiler_options brw_scalar_nir_options = {
COMMON_OPTIONS,
COMMON_SCALAR_OPTIONS,
};
const struct nir_shader_compiler_options brw_vector_nir_options = {
COMMON_OPTIONS,
/* In the vec4 backend, our dpN instruction replicates its result to all the
* components of a vec4. We would like NIR to give us replicated fdot
* instructions because it can optimize better for us.
*/
.fdot_replicates = true,
.lower_usub_sat = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_unorm_2x16 = true,
.lower_unpack_snorm_2x16 = true,
.lower_unpack_unorm_2x16 = true,
.lower_extract_byte = true,
.lower_extract_word = true,
.intel_vec4 = true,
.max_unroll_iterations = 32,
};
struct brw_compiler *
brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
{
struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
compiler->devinfo = devinfo;
brw_init_isa_info(&compiler->isa, devinfo);
brw_fs_alloc_reg_sets(compiler);
if (devinfo->ver < 8)
brw_vec4_alloc_reg_set(compiler);
compiler->precise_trig = debug_get_bool_option("INTEL_PRECISE_TRIG", false);
compiler->use_tcs_multi_patch = devinfo->ver >= 12;
/* Default to the sampler since that's what we've done since forever */
compiler->indirect_ubos_use_sampler = true;
compiler->lower_dpas = devinfo->verx10 < 125 ||
intel_device_info_is_mtl(devinfo) ||
(intel_device_info_is_arl(devinfo) &&
devinfo->platform != INTEL_PLATFORM_ARL_H) ||
debug_get_bool_option("INTEL_LOWER_DPAS", false);
/* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
compiler->scalar_stage[i] = devinfo->ver >= 8 ||
i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;
}
for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)
compiler->scalar_stage[i] = true;
nir_lower_int64_options int64_options =
nir_lower_imul64 |
nir_lower_isign64 |
nir_lower_divmod64 |
nir_lower_imul_high64 |
nir_lower_find_lsb64 |
nir_lower_ufind_msb64 |
nir_lower_bit_count64;
nir_lower_doubles_options fp64_options =
nir_lower_drcp |
nir_lower_dsqrt |
nir_lower_drsq |
nir_lower_dtrunc |
nir_lower_dfloor |
nir_lower_dceil |
nir_lower_dfract |
nir_lower_dround_even |
nir_lower_dmod |
nir_lower_dsub |
nir_lower_ddiv;
if (!devinfo->has_64bit_float || INTEL_DEBUG(DEBUG_SOFT64))
fp64_options |= nir_lower_fp64_full_software;
if (!devinfo->has_64bit_int)
int64_options |= (nir_lower_int64_options)~0;
/* The Bspec's section titled "Instruction_multiply[DevBDW+]" claims that
* destination type can be Quadword and source type Doubleword for Gfx8 and
* Gfx9. So, lower 64 bit multiply instruction on rest of the platforms.
*/
if (devinfo->ver < 8 || devinfo->ver > 9)
int64_options |= nir_lower_imul_2x32_64;
/* We want the GLSL compiler to emit code that uses condition codes */
for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {
struct nir_shader_compiler_options *nir_options =
rzalloc(compiler, struct nir_shader_compiler_options);
bool is_scalar = compiler->scalar_stage[i];
if (is_scalar) {
*nir_options = brw_scalar_nir_options;
int64_options |= nir_lower_usub_sat64;
} else {
*nir_options = brw_vector_nir_options;
}
/* Prior to Gfx6, there are no three source operations, and Gfx11 loses
* LRP.
*/
nir_options->lower_ffma16 = devinfo->ver < 6;
nir_options->lower_ffma32 = devinfo->ver < 6;
nir_options->lower_ffma64 = devinfo->ver < 6;
nir_options->lower_flrp32 = devinfo->ver < 6 || devinfo->ver >= 11;
nir_options->lower_fpow = devinfo->ver >= 12;
nir_options->has_bfe = devinfo->ver >= 7;
nir_options->has_bfm = devinfo->ver >= 7;
nir_options->has_bfi = devinfo->ver >= 7;
nir_options->has_rotate16 = devinfo->ver >= 11;
nir_options->has_rotate32 = devinfo->ver >= 11;
nir_options->lower_bitfield_reverse = devinfo->ver < 7;
nir_options->lower_find_lsb = devinfo->ver < 7;
nir_options->lower_ifind_msb = devinfo->ver < 7;
nir_options->has_iadd3 = devinfo->verx10 >= 125;
nir_options->has_sdot_4x8 = devinfo->ver >= 12;
nir_options->has_udot_4x8 = devinfo->ver >= 12;
nir_options->has_sudot_4x8 = devinfo->ver >= 12;
nir_options->has_sdot_4x8_sat = devinfo->ver >= 12;
nir_options->has_udot_4x8_sat = devinfo->ver >= 12;
nir_options->has_sudot_4x8_sat = devinfo->ver >= 12;
nir_options->lower_int64_options = int64_options;
nir_options->lower_doubles_options = fp64_options;
nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT;
nir_options->force_indirect_unrolling |=
brw_nir_no_indirect_mask(compiler, i);
nir_options->force_indirect_unrolling_sampler = devinfo->ver < 7;
if (compiler->use_tcs_multi_patch) {
/* TCS MULTI_PATCH mode has multiple patches per subgroup */
nir_options->divergence_analysis_options &=
~nir_divergence_single_patch_per_tcs_subgroup;
}
if (devinfo->ver < 12)
nir_options->divergence_analysis_options |=
nir_divergence_single_prim_per_subgroup;
compiler->nir_options[i] = nir_options;
}
compiler->mesh.mue_header_packing =
(unsigned)debug_get_num_option("INTEL_MESH_HEADER_PACKING", 3);
compiler->mesh.mue_compaction =
debug_get_bool_option("INTEL_MESH_COMPACTION", true);
return compiler;
}
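/* Typical driver usage (a sketch; ctx and devinfo are assumed to come
 * from the caller):
 *
 *    void *ctx = ralloc_context(NULL);
 *    struct brw_compiler *compiler = brw_compiler_create(ctx, devinfo);
 *    const struct nir_shader_compiler_options *opts =
 *       compiler->nir_options[MESA_SHADER_FRAGMENT];
 */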
static void
insert_u64_bit(uint64_t *val, bool add)
{
*val = (*val << 1) | !!add;
}
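/* For example, three successive calls with true, false, true turn
 * *val = 0 into 0b101: each call shifts left and ORs in the new bit.
 */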
uint64_t
brw_get_compiler_config_value(const struct brw_compiler *compiler)
{
uint64_t config = 0;
unsigned bits = 0;
insert_u64_bit(&config, compiler->precise_trig);
bits++;
insert_u64_bit(&config, compiler->lower_dpas);
bits++;
insert_u64_bit(&config, compiler->mesh.mue_compaction);
bits++;
uint64_t mask = DEBUG_DISK_CACHE_MASK;
bits += util_bitcount64(mask);
u_foreach_bit64(bit, mask)
insert_u64_bit(&config, INTEL_DEBUG(1ULL << bit));
mask = SIMD_DISK_CACHE_MASK;
bits += util_bitcount64(mask);
u_foreach_bit64(bit, mask)
insert_u64_bit(&config, (intel_simd & (1ULL << bit)) != 0);
mask = 3;
bits += util_bitcount64(mask);
u_foreach_bit64(bit, mask)
insert_u64_bit(&config, (compiler->mesh.mue_header_packing & (1ULL << bit)) != 0);
assert(bits <= util_bitcount64(UINT64_MAX));
return config;
}
void
brw_device_sha1(char *hex,
const struct intel_device_info *devinfo)
{
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
brw_device_sha1_update(&ctx, devinfo);
unsigned char result[20];
_mesa_sha1_final(&ctx, result);
_mesa_sha1_format(hex, result);
}
unsigned
brw_prog_data_size(gl_shader_stage stage)
{
static const size_t stage_sizes[] = {
[MESA_SHADER_VERTEX] = sizeof(struct brw_vs_prog_data),
[MESA_SHADER_TESS_CTRL] = sizeof(struct brw_tcs_prog_data),
[MESA_SHADER_TESS_EVAL] = sizeof(struct brw_tes_prog_data),
[MESA_SHADER_GEOMETRY] = sizeof(struct brw_gs_prog_data),
[MESA_SHADER_FRAGMENT] = sizeof(struct brw_wm_prog_data),
[MESA_SHADER_COMPUTE] = sizeof(struct brw_cs_prog_data),
[MESA_SHADER_TASK] = sizeof(struct brw_task_prog_data),
[MESA_SHADER_MESH] = sizeof(struct brw_mesh_prog_data),
[MESA_SHADER_RAYGEN] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_ANY_HIT] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_CLOSEST_HIT] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_MISS] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_CALLABLE] = sizeof(struct brw_bs_prog_data),
[MESA_SHADER_KERNEL] = sizeof(struct brw_cs_prog_data),
};
assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
return stage_sizes[stage];
}
unsigned
brw_prog_key_size(gl_shader_stage stage)
{
static const size_t stage_sizes[] = {
[MESA_SHADER_VERTEX] = sizeof(struct brw_vs_prog_key),
[MESA_SHADER_TESS_CTRL] = sizeof(struct brw_tcs_prog_key),
[MESA_SHADER_TESS_EVAL] = sizeof(struct brw_tes_prog_key),
[MESA_SHADER_GEOMETRY] = sizeof(struct brw_gs_prog_key),
[MESA_SHADER_FRAGMENT] = sizeof(struct brw_wm_prog_key),
[MESA_SHADER_COMPUTE] = sizeof(struct brw_cs_prog_key),
[MESA_SHADER_TASK] = sizeof(struct brw_task_prog_key),
[MESA_SHADER_MESH] = sizeof(struct brw_mesh_prog_key),
[MESA_SHADER_RAYGEN] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_ANY_HIT] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_CLOSEST_HIT] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_MISS] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_INTERSECTION] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_CALLABLE] = sizeof(struct brw_bs_prog_key),
[MESA_SHADER_KERNEL] = sizeof(struct brw_cs_prog_key),
};
assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_sizes));
return stage_sizes[stage];
}
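/* Patch relocation values into a compiled program. For example, a
 * driver holding a reloc id can supply the final value at upload time
 * (reloc_id and addr are illustrative):
 *
 *    struct brw_shader_reloc_value v = { .id = reloc_id, .value = addr };
 *    brw_write_shader_relocs(isa, program, prog_data, &v, 1);
 */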
void
brw_write_shader_relocs(const struct brw_isa_info *isa,
void *program,
const struct brw_stage_prog_data *prog_data,
struct brw_shader_reloc_value *values,
unsigned num_values)
{
for (unsigned i = 0; i < prog_data->num_relocs; i++) {
assert(prog_data->relocs[i].offset % 8 == 0);
void *dst = program + prog_data->relocs[i].offset;
for (unsigned j = 0; j < num_values; j++) {
if (prog_data->relocs[i].id == values[j].id) {
uint32_t value = values[j].value + prog_data->relocs[i].delta;
switch (prog_data->relocs[i].type) {
case BRW_SHADER_RELOC_TYPE_U32:
*(uint32_t *)dst = value;
break;
case BRW_SHADER_RELOC_TYPE_MOV_IMM:
brw_update_reloc_imm(isa, dst, value);
break;
default:
unreachable("Invalid relocation type");
}
break;
}
}
}
}

File diff suppressed because it is too large

View file

@ -0,0 +1,121 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/** @file brw_dead_control_flow.cpp
*
* This file implements the dead control flow elimination optimization pass.
*/
#include "brw_shader.h"
#include "brw_cfg.h"
using namespace brw;
/* Look for and eliminate dead control flow:
*
* - if/endif
* - else in else/endif
* - then in if/else/endif
*/
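/* These patterns typically remain after other passes (dead code
 * elimination, copy propagation) have emptied one side of a
 * conditional: once every instruction between an IF and its ENDIF is
 * gone, the IF/ENDIF pair itself guards nothing and can be removed too.
 */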
bool
dead_control_flow_eliminate(backend_shader *s)
{
bool progress = false;
foreach_block_safe (block, s->cfg) {
bblock_t *prev_block = block->prev();
if (!prev_block)
continue;
backend_instruction *const inst = block->start();
backend_instruction *const prev_inst = prev_block->end();
/* ENDIF instructions, by definition, can only be found at the start of
* basic blocks.
*/
if (inst->opcode == BRW_OPCODE_ENDIF &&
prev_inst->opcode == BRW_OPCODE_ELSE) {
bblock_t *const else_block = prev_block;
backend_instruction *const else_inst = prev_inst;
else_inst->remove(else_block);
progress = true;
} else if (inst->opcode == BRW_OPCODE_ENDIF &&
prev_inst->opcode == BRW_OPCODE_IF) {
bblock_t *const endif_block = block;
bblock_t *const if_block = prev_block;
backend_instruction *const endif_inst = inst;
backend_instruction *const if_inst = prev_inst;
bblock_t *earlier_block = NULL, *later_block = NULL;
if (if_block->start_ip == if_block->end_ip) {
earlier_block = if_block->prev();
} else {
earlier_block = if_block;
}
if_inst->remove(if_block);
if (endif_block->start_ip == endif_block->end_ip) {
later_block = endif_block->next();
} else {
later_block = endif_block;
}
endif_inst->remove(endif_block);
assert((earlier_block == NULL) == (later_block == NULL));
if (earlier_block && earlier_block->can_combine_with(later_block)) {
earlier_block->combine_with(later_block);
/* If ENDIF was in its own block, then we've now deleted it and
* merged the two surrounding blocks, the latter of which the
* __next block pointer was pointing to.
*/
if (endif_block != later_block) {
__next = earlier_block->next();
}
}
progress = true;
} else if (inst->opcode == BRW_OPCODE_ELSE &&
prev_inst->opcode == BRW_OPCODE_IF) {
bblock_t *const else_block = block;
backend_instruction *const if_inst = prev_inst;
backend_instruction *const else_inst = inst;
/* Since the else-branch is becoming the new then-branch, the
* condition has to be inverted.
*/
if_inst->predicate_inverse = !if_inst->predicate_inverse;
else_inst->remove(else_block);
progress = true;
}
}
if (progress)
s->invalidate_analysis(DEPENDENCY_BLOCKS | DEPENDENCY_INSTRUCTIONS);
return progress;
}

View file

@ -0,0 +1,31 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_DEAD_CONTROL_FLOW_H
#define BRW_DEAD_CONTROL_FLOW_H
#include "brw_shader.h"
bool dead_control_flow_eliminate(backend_shader *s);
#endif /* BRW_DEAD_CONTROL_FLOW_H */

View file

@ -0,0 +1,238 @@
/*
* Copyright © 2019 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* @file brw_debug_recompiles.c
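*
* Helpers that log which program-key fields differ between two compiles of
* the same shader, to help explain why a recompile happened.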
*/
#include <stdio.h>
#include "brw_compiler.h"
static bool
key_debug(const struct brw_compiler *c, void *log,
const char *name, int a, int b)
{
if (a != b) {
brw_shader_perf_log(c, log, " %s %d->%d\n", name, a, b);
return true;
}
return false;
}
static bool
key_debug_float(const struct brw_compiler *c, void *log,
const char *name, float a, float b)
{
if (a != b) {
brw_shader_perf_log(c, log, " %s %f->%f\n", name, a, b);
return true;
}
return false;
}
#define check(name, field) \
key_debug(c, log, name, old_key->field, key->field)
#define check_float(name, field) \
key_debug_float(c, log, name, old_key->field, key->field)
static bool
debug_sampler_recompile(const struct brw_compiler *c, void *log,
const struct brw_sampler_prog_key_data *old_key,
const struct brw_sampler_prog_key_data *key)
{
bool found = false;
found |= check("gather channel quirk", gather_channel_quirk_mask);
for (unsigned i = 0; i < BRW_MAX_SAMPLERS; i++) {
found |= check("EXT_texture_swizzle or DEPTH_TEXTURE_MODE", swizzles[i]);
found |= check("textureGather workarounds", gfx6_gather_wa[i]);
}
for (unsigned i = 0; i < 3; i++) {
found |= check("GL_CLAMP enabled on any texture unit", gl_clamp_mask[i]);
}
return found;
}
static bool
debug_base_recompile(const struct brw_compiler *c, void *log,
const struct brw_base_prog_key *old_key,
const struct brw_base_prog_key *key)
{
return debug_sampler_recompile(c, log, &old_key->tex, &key->tex);
}
static void
debug_vs_recompile(const struct brw_compiler *c, void *log,
const struct brw_vs_prog_key *old_key,
const struct brw_vs_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
for (unsigned i = 0; i < VERT_ATTRIB_MAX; i++) {
found |= check("vertex attrib w/a flags", gl_attrib_wa_flags[i]);
}
found |= check("legacy user clipping", nr_userclip_plane_consts);
found |= check("copy edgeflag", copy_edgeflag);
found |= check("pointcoord replace", point_coord_replace);
found |= check("vertex color clamping", clamp_vertex_color);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_tcs_recompile(const struct brw_compiler *c, void *log,
const struct brw_tcs_prog_key *old_key,
const struct brw_tcs_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
found |= check("input vertices", input_vertices);
found |= check("outputs written", outputs_written);
found |= check("patch outputs written", patch_outputs_written);
found |= check("tes primitive mode", _tes_primitive_mode);
found |= check("quads and equal_spacing workaround", quads_workaround);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_tes_recompile(const struct brw_compiler *c, void *log,
const struct brw_tes_prog_key *old_key,
const struct brw_tes_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
found |= check("inputs read", inputs_read);
found |= check("patch inputs read", patch_inputs_read);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_gs_recompile(const struct brw_compiler *c, void *log,
const struct brw_gs_prog_key *old_key,
const struct brw_gs_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_fs_recompile(const struct brw_compiler *c, void *log,
const struct brw_wm_prog_key *old_key,
const struct brw_wm_prog_key *key)
{
bool found = false;
found |= check("alphatest, computed depth, depth test, or depth write",
iz_lookup);
found |= check("depth statistics", stats_wm);
found |= check("flat shading", flat_shade);
found |= check("number of color buffers", nr_color_regions);
found |= check("MRT alpha test", alpha_test_replicate_alpha);
found |= check("alpha to coverage", alpha_to_coverage);
found |= check("fragment color clamping", clamp_fragment_color);
found |= check("per-sample interpolation", persample_interp);
found |= check("multisampled FBO", multisample_fbo);
found |= check("line smoothing", line_aa);
found |= check("force dual color blending", force_dual_color_blend);
found |= check("coherent fb fetch", coherent_fb_fetch);
found |= check("ignore sample mask out", ignore_sample_mask_out);
found |= check("coarse pixel", coarse_pixel);
found |= check("input slots valid", input_slots_valid);
found |= check("mrt alpha test function", alpha_test_func);
found |= check("mrt alpha test reference value", alpha_test_ref);
found |= debug_base_recompile(c, log, &old_key->base, &key->base);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
static void
debug_cs_recompile(const struct brw_compiler *c, void *log,
const struct brw_cs_prog_key *old_key,
const struct brw_cs_prog_key *key)
{
bool found = debug_base_recompile(c, log, &old_key->base, &key->base);
if (!found) {
brw_shader_perf_log(c, log, " something else\n");
}
}
void
brw_debug_key_recompile(const struct brw_compiler *c, void *log,
gl_shader_stage stage,
const struct brw_base_prog_key *old_key,
const struct brw_base_prog_key *key)
{
if (!old_key) {
brw_shader_perf_log(c, log, " No previous compile found...\n");
return;
}
switch (stage) {
case MESA_SHADER_VERTEX:
debug_vs_recompile(c, log, (const struct brw_vs_prog_key *)old_key,
(const struct brw_vs_prog_key *)key);
break;
case MESA_SHADER_TESS_CTRL:
debug_tcs_recompile(c, log, (const struct brw_tcs_prog_key *)old_key,
(const struct brw_tcs_prog_key *)key);
break;
case MESA_SHADER_TESS_EVAL:
debug_tes_recompile(c, log, (const struct brw_tes_prog_key *)old_key,
(const struct brw_tes_prog_key *)key);
break;
case MESA_SHADER_GEOMETRY:
debug_gs_recompile(c, log, (const struct brw_gs_prog_key *)old_key,
(const struct brw_gs_prog_key *)key);
break;
case MESA_SHADER_FRAGMENT:
debug_fs_recompile(c, log, (const struct brw_wm_prog_key *)old_key,
(const struct brw_wm_prog_key *)key);
break;
case MESA_SHADER_COMPUTE:
debug_cs_recompile(c, log, (const struct brw_cs_prog_key *)old_key,
(const struct brw_cs_prog_key *)key);
break;
default:
break;
}
}


@@ -0,0 +1,74 @@
#!/usr/bin/env python3
COPYRIGHT = """\
/*
* Copyright 2024 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
"""
import argparse
import os
import sys
from mako.template import Template
from mako import exceptions
sys.path.append(f"{os.path.dirname(sys.argv[0])}/../dev")
import intel_device_info
template = COPYRIGHT + """
/* DO NOT EDIT - This file is generated automatically by the intel_device_serialize_c.py script */
#include "dev/intel_device_info.h"
#include "brw_compiler.h"
#define SHA_UPDATE_FIELD(field) _mesa_sha1_update(ctx, &devinfo->field, sizeof(devinfo->field))
void
brw_device_sha1_update(struct mesa_sha1 *ctx,
const struct intel_device_info *devinfo) {
% for member in compiler_fields:
SHA_UPDATE_FIELD(${member.name});
% endfor
}
#undef SHA_UPDATE_FIELD
"""
def main():
"""print intel_device_serialize.c at the specified path"""
parser = argparse.ArgumentParser()
parser.add_argument('--outdir', required=True,
help='Directory to put the generated files in')
args = parser.parse_args()
path = os.path.join(args.outdir, 'brw_device_sha1_gen.c')
device_members = intel_device_info.TYPES_BY_NAME["intel_device_info"].members
compiler_fields = [field for field in device_members if field.compiler_field]
with open(path, 'w', encoding='utf-8') as f:
try:
f.write(Template(template).render(compiler_fields=compiler_fields))
except Exception:
print(exceptions.text_error_template().render())
sys.exit(1)
if __name__ == "__main__":
main()

File diff suppressed because it is too large


@@ -0,0 +1,42 @@
/*
* Copyright 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#ifndef BRW_DISASM_H
#define BRW_DISASM_H
#include <stdio.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
struct brw_isa_info;
struct brw_inst;
const struct brw_label *brw_find_label(const struct brw_label *root, int offset);
void brw_create_label(struct brw_label **labels, int offset, void *mem_ctx);
int brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa,
const struct brw_inst *inst, bool is_compacted,
int offset, const struct brw_label *root_label);
const struct brw_label *brw_label_assembly(const struct brw_isa_info *isa,
const void *assembly, int start, int end,
void *mem_ctx);
void brw_disassemble_with_labels(const struct brw_isa_info *isa,
const void *assembly, int start, int end, FILE *out);
void brw_disassemble(const struct brw_isa_info *isa,
const void *assembly, int start, int end,
const struct brw_label *root_label, FILE *out);
int brw_disassemble_find_end(const struct brw_isa_info *isa,
const void *assembly, int start);
void brw_disassemble_with_errors(const struct brw_isa_info *isa,
const void *assembly, int start, FILE *out);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* BRW_DISASM_H */


@@ -0,0 +1,207 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_cfg.h"
#include "brw_eu.h"
#include "brw_disasm.h"
#include "brw_disasm_info.h"
#include "dev/intel_debug.h"
#include "compiler/nir/nir.h"
__attribute__((weak)) void nir_print_instr(UNUSED const nir_instr *instr,
UNUSED FILE *fp) {}
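/* Print the disassembly of [start_offset, end_offset) to stderr,
* interleaved with basic-block boundaries, optional NIR annotations, and
* any validation errors recorded in `disasm`.
*/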
void
dump_assembly(void *assembly, int start_offset, int end_offset,
struct disasm_info *disasm, const unsigned *block_latency)
{
const struct brw_isa_info *isa = disasm->isa;
const char *last_annotation_string = NULL;
const void *last_annotation_ir = NULL;
void *mem_ctx = ralloc_context(NULL);
const struct brw_label *root_label =
brw_label_assembly(isa, assembly, start_offset, end_offset, mem_ctx);
foreach_list_typed(struct inst_group, group, link, &disasm->group_list) {
struct exec_node *next_node = exec_node_get_next(&group->link);
if (exec_node_is_tail_sentinel(next_node))
break;
struct inst_group *next =
exec_node_data(struct inst_group, next_node, link);
int start_offset = group->offset;
int end_offset = next->offset;
if (group->block_start) {
fprintf(stderr, " START B%d", group->block_start->num);
foreach_list_typed(struct bblock_link, predecessor_link, link,
&group->block_start->parents) {
struct bblock_t *predecessor_block = predecessor_link->block;
fprintf(stderr, " <-B%d", predecessor_block->num);
}
if (block_latency)
fprintf(stderr, " (%u cycles)",
block_latency[group->block_start->num]);
fprintf(stderr, "\n");
}
if (last_annotation_ir != group->ir) {
last_annotation_ir = group->ir;
if (last_annotation_ir) {
fprintf(stderr, " ");
nir_print_instr(group->ir, stderr);
fprintf(stderr, "\n");
}
}
if (last_annotation_string != group->annotation) {
last_annotation_string = group->annotation;
if (last_annotation_string)
fprintf(stderr, " %s\n", last_annotation_string);
}
brw_disassemble(isa, assembly, start_offset, end_offset,
root_label, stderr);
if (group->error) {
fputs(group->error, stderr);
}
if (group->block_end) {
fprintf(stderr, " END B%d", group->block_end->num);
foreach_list_typed(struct bblock_link, successor_link, link,
&group->block_end->children) {
struct bblock_t *successor_block = successor_link->block;
fprintf(stderr, " ->B%d", successor_block->num);
}
fprintf(stderr, "\n");
}
}
fprintf(stderr, "\n");
ralloc_free(mem_ctx);
}
struct disasm_info *
disasm_initialize(const struct brw_isa_info *isa,
const struct cfg_t *cfg)
{
struct disasm_info *disasm = ralloc(NULL, struct disasm_info);
exec_list_make_empty(&disasm->group_list);
disasm->isa = isa;
disasm->cfg = cfg;
disasm->cur_block = 0;
disasm->use_tail = false;
return disasm;
}
struct inst_group *
disasm_new_inst_group(struct disasm_info *disasm, unsigned next_inst_offset)
{
struct inst_group *tail = rzalloc(disasm, struct inst_group);
tail->offset = next_inst_offset;
exec_list_push_tail(&disasm->group_list, &tail->link);
return tail;
}
void
disasm_annotate(struct disasm_info *disasm,
struct backend_instruction *inst, unsigned offset)
{
const struct intel_device_info *devinfo = disasm->isa->devinfo;
const struct cfg_t *cfg = disasm->cfg;
struct inst_group *group;
if (!disasm->use_tail) {
group = disasm_new_inst_group(disasm, offset);
} else {
disasm->use_tail = false;
group = exec_node_data(struct inst_group,
exec_list_get_tail_raw(&disasm->group_list), link);
}
if (INTEL_DEBUG(DEBUG_ANNOTATION)) {
group->ir = inst->ir;
group->annotation = inst->annotation;
}
if (bblock_start(cfg->blocks[disasm->cur_block]) == inst) {
group->block_start = cfg->blocks[disasm->cur_block];
}
/* There is no hardware DO instruction on Gfx6+, so since DO always
* starts a basic block, we need to set the .block_start of the next
* instruction's annotation with a pointer to the bblock started by
* the DO.
*
* The only complication is that this emits an annotation without a
* corresponding hardware instruction to disassemble.
*/
if (devinfo->ver >= 6 && inst->opcode == BRW_OPCODE_DO) {
disasm->use_tail = true;
}
if (bblock_end(cfg->blocks[disasm->cur_block]) == inst) {
group->block_end = cfg->blocks[disasm->cur_block];
disasm->cur_block++;
}
}
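/* Attach an error message to the instruction group containing the
* instruction at `offset`. If the group extends past that instruction,
* split it so the error lands only on the offending range; the trailing
* part keeps the original group's block_end marker.
*/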
void
disasm_insert_error(struct disasm_info *disasm, unsigned offset,
unsigned inst_size, const char *error)
{
foreach_list_typed(struct inst_group, cur, link, &disasm->group_list) {
struct exec_node *next_node = exec_node_get_next(&cur->link);
if (exec_node_is_tail_sentinel(next_node))
break;
struct inst_group *next =
exec_node_data(struct inst_group, next_node, link);
if (next->offset <= offset)
continue;
if (offset + inst_size != next->offset) {
struct inst_group *new = ralloc(disasm, struct inst_group);
memcpy(new, cur, sizeof(struct inst_group));
cur->error = NULL;
cur->error_length = 0;
cur->block_end = NULL;
new->offset = offset + inst_size;
new->block_start = NULL;
exec_node_insert_after(&cur->link, &new->link);
}
if (cur->error)
ralloc_strcat(&cur->error, error);
else
cur->error = ralloc_strdup(disasm, error);
return;
}
}


@@ -0,0 +1,90 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef _INTEL_ASM_ANNOTATION_H
#define _INTEL_ASM_ANNOTATION_H
#include "compiler/glsl/list.h"
#ifdef __cplusplus
extern "C" {
#endif
struct cfg_t;
struct backend_instruction;
struct intel_device_info;
struct inst_group {
struct exec_node link;
int offset;
size_t error_length;
char *error;
/* Pointers to the basic block in the CFG if the instruction group starts
* or ends a basic block.
*/
struct bblock_t *block_start;
struct bblock_t *block_end;
/* Annotation for the generated IR. One of the two can be set. */
const void *ir;
const char *annotation;
};
struct disasm_info {
struct exec_list group_list;
const struct brw_isa_info *isa;
const struct cfg_t *cfg;
/** Block index in the cfg. */
int cur_block;
bool use_tail;
};
void
dump_assembly(void *assembly, int start_offset, int end_offset,
struct disasm_info *disasm, const unsigned *block_latency);
struct disasm_info *
disasm_initialize(const struct brw_isa_info *isa,
const struct cfg_t *cfg);
struct inst_group *
disasm_new_inst_group(struct disasm_info *disasm, unsigned offset);
void
disasm_annotate(struct disasm_info *disasm,
struct backend_instruction *inst, unsigned offset);
void
disasm_insert_error(struct disasm_info *disasm, unsigned offset,
unsigned inst_size, const char *error);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* _INTEL_ASM_ANNOTATION_H */


@@ -0,0 +1,242 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include "compiler/brw_disasm.h"
#include "compiler/brw_isa_info.h"
#include "dev/intel_device_info.h"
#include "util/u_dynarray.h"
enum opt_input_type {
OPT_INPUT_BINARY,
OPT_INPUT_C_LITERAL,
};
static enum opt_input_type input_type = OPT_INPUT_BINARY;
/* Return the size, in bytes, of the file pointed to by fp */
static long
i965_disasm_get_file_size(FILE *fp)
{
long size;
fseek(fp, 0L, SEEK_END);
size = ftell(fp);
fseek(fp, 0L, SEEK_SET);
return size;
}
/* Read a hex file in the following format, for example:
* { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }
*/
static void *
i965_disasm_read_c_literal_file(FILE *fp, size_t *end)
{
struct util_dynarray assembly = {};
uint32_t temp[2];
if (fscanf(fp, " { ") == EOF) {
fprintf(stderr, "Couldn't find opening `{`\n");
return NULL;
}
if (fscanf(fp, "0x%x , 0x%x", &temp[0], &temp[1]) == 2) {
util_dynarray_append(&assembly, uint32_t, temp[0]);
util_dynarray_append(&assembly, uint32_t, temp[1]);
} else {
fprintf(stderr, "Couldn't read hex values\n");
return NULL;
}
while (fscanf(fp, " , 0x%x , 0x%x ", &temp[0], &temp[1]) == 2) {
util_dynarray_append(&assembly, uint32_t, temp[0]);
util_dynarray_append(&assembly, uint32_t, temp[1]);
}
if (fscanf(fp, "}") == EOF) {
fprintf(stderr, "Couldn't find closing `}`\n");
return NULL;
}
*end = assembly.size;
return assembly.data;
}
static void *
i965_disasm_read_binary(FILE *fp, size_t *end)
{
size_t size;
void *assembly;
long sz = i965_disasm_get_file_size(fp);
if (sz < 0)
return NULL;
*end = (size_t)sz;
if (!*end)
return NULL;
assembly = malloc(*end + 1);
if (assembly == NULL)
return NULL;
size = fread(assembly, *end, 1, fp);
if (!size) {
free(assembly);
return NULL;
}
return assembly;
}
static void
print_help(const char *progname, FILE *file)
{
fprintf(file,
"Usage: %s [OPTION]...\n"
"Disassemble i965 instructions from binary file.\n\n"
" --help display this help and exit\n"
" --input-path=PATH read binary file from binary file PATH\n"
" --type=INPUT_TYPE INPUT_TYPE can be 'bin' (default if omitted),\n"
" 'c_literal'.\n"
" --gen=platform disassemble instructions for given \n"
" platform (3 letter platform name)\n",
progname);
}
int main(int argc, char *argv[])
{
FILE *fp = NULL;
void *assembly = NULL;
char *file_path = NULL;
size_t start = 0, end = 0;
uint16_t pci_id = 0;
int c;
int result = EXIT_FAILURE;
bool help = false;
const struct option i965_disasm_opts[] = {
{ "help", no_argument, (int *) &help, true },
{ "input-path", required_argument, NULL, 'i' },
{ "type", required_argument, NULL, 't' },
{ "gen", required_argument, NULL, 'g'},
{ NULL, 0, NULL, 0 }
};
while ((c = getopt_long(argc, argv, ":i:t:g:h", i965_disasm_opts, NULL)) != -1) {
switch (c) {
case 'g': {
const int id = intel_device_name_to_pci_device_id(optarg);
if (id < 0) {
fprintf(stderr, "can't parse gen: '%s', expected 3 letter "
"platform name\n", optarg);
goto end;
} else {
pci_id = id;
}
break;
}
case 'i':
file_path = strdup(optarg);
fp = fopen(file_path, "r");
if (!fp) {
fprintf(stderr, "Unable to read input file : %s\n",
file_path);
goto end;
}
break;
case 't':
if (strcmp(optarg, "c_literal") == 0) {
input_type = OPT_INPUT_C_LITERAL;
} else if (strcmp(optarg, "bin") == 0) {
input_type = OPT_INPUT_BINARY;
} else {
fprintf(stderr, "invalid value for --type: %s\n", optarg);
goto end;
}
break;
case 'h':
help = true;
print_help(argv[0], stderr);
goto end;
case 0:
break;
case ':':
fprintf(stderr, "%s: option `-%c' requires an argument\n",
argv[0], optopt);
goto end;
case '?':
default:
fprintf(stderr, "%s: option `-%c' is invalid: ignored\n",
argv[0], optopt);
goto end;
}
}
if (help || !file_path || !pci_id) {
print_help(argv[0], stderr);
exit(0);
}
struct intel_device_info devinfo;
if (!intel_get_device_info_from_pci_id(pci_id, &devinfo)) {
fprintf(stderr, "can't find device information: pci_id=0x%x\n", pci_id);
exit(EXIT_FAILURE);
}
struct brw_isa_info isa;
brw_init_isa_info(&isa, &devinfo);
if (input_type == OPT_INPUT_BINARY)
assembly = i965_disasm_read_binary(fp, &end);
else if (input_type == OPT_INPUT_C_LITERAL)
assembly = i965_disasm_read_c_literal_file(fp, &end);
if (!assembly) {
if (end)
fprintf(stderr, "Unable to allocate buffer to read input file\n");
else
fprintf(stderr, "Failed to read input file\n");
goto end;
}
/* Disassemble i965 instructions from buffer assembly */
brw_disassemble_with_labels(&isa, assembly, start, end, stdout);
result = EXIT_SUCCESS;
end:
if (fp)
fclose(fp);
free(file_path);
free(assembly);
exit(result);
}


@@ -0,0 +1,856 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "brw_disasm.h"
#include "brw_eu_defines.h"
#include "brw_eu.h"
#include "brw_shader.h"
#include "intel_gfx_ver_enum.h"
#include "dev/intel_debug.h"
#include "util/u_debug.h"
#include "util/ralloc.h"
/* Returns a conditional modifier that negates the condition. */
enum brw_conditional_mod
brw_negate_cmod(enum brw_conditional_mod cmod)
{
switch (cmod) {
case BRW_CONDITIONAL_Z:
return BRW_CONDITIONAL_NZ;
case BRW_CONDITIONAL_NZ:
return BRW_CONDITIONAL_Z;
case BRW_CONDITIONAL_G:
return BRW_CONDITIONAL_LE;
case BRW_CONDITIONAL_GE:
return BRW_CONDITIONAL_L;
case BRW_CONDITIONAL_L:
return BRW_CONDITIONAL_GE;
case BRW_CONDITIONAL_LE:
return BRW_CONDITIONAL_G;
default:
unreachable("Can't negate this cmod");
}
}
/* Returns the corresponding conditional mod for swapping src0 and
* src1 in e.g. CMP.
*/
enum brw_conditional_mod
brw_swap_cmod(enum brw_conditional_mod cmod)
{
switch (cmod) {
case BRW_CONDITIONAL_Z:
case BRW_CONDITIONAL_NZ:
return cmod;
case BRW_CONDITIONAL_G:
return BRW_CONDITIONAL_L;
case BRW_CONDITIONAL_GE:
return BRW_CONDITIONAL_LE;
case BRW_CONDITIONAL_L:
return BRW_CONDITIONAL_G;
case BRW_CONDITIONAL_LE:
return BRW_CONDITIONAL_GE;
default:
return BRW_CONDITIONAL_NONE;
}
}
/**
* Get the least significant bit offset of the i+1-th component of immediate
* type \p type. For \p i equal to the two's complement of j, return the
* offset of the j-th component starting from the end of the vector. For
* scalar register types return zero.
*/
static unsigned
imm_shift(enum brw_reg_type type, unsigned i)
{
assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V &&
"Not implemented.");
if (type == BRW_REGISTER_TYPE_VF)
return 8 * (i & 3);
else
return 0;
}
/**
* Swizzle an arbitrary immediate \p x of the given type according to the
* permutation specified as \p swz.
*/
uint32_t
brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz)
{
if (imm_shift(type, 1)) {
const unsigned n = 32 / imm_shift(type, 1);
uint32_t y = 0;
for (unsigned i = 0; i < n; i++) {
/* Shift the specified component all the way to the right and left to
* discard any undesired L/MSBs, then shift it right into component i.
*/
y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3))
<< imm_shift(type, ~0u)
>> imm_shift(type, ~0u - i);
}
return y;
} else {
return x;
}
}
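/* Worked example (illustrative): for a VF immediate 0x44332211, whose 8-bit
* components are {x=0x11, y=0x22, z=0x33, w=0x44}, a swizzle selecting
* (y, x, w, z) produces {0x22, 0x11, 0x44, 0x33}, i.e. 0x33441122.
*/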
unsigned
brw_get_default_exec_size(struct brw_codegen *p)
{
return p->current->exec_size;
}
unsigned
brw_get_default_group(struct brw_codegen *p)
{
return p->current->group;
}
unsigned
brw_get_default_access_mode(struct brw_codegen *p)
{
return p->current->access_mode;
}
struct tgl_swsb
brw_get_default_swsb(struct brw_codegen *p)
{
return p->current->swsb;
}
void
brw_set_default_exec_size(struct brw_codegen *p, unsigned value)
{
p->current->exec_size = value;
}
void brw_set_default_predicate_control(struct brw_codegen *p, enum brw_predicate pc)
{
p->current->predicate = pc;
}
void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse)
{
p->current->pred_inv = predicate_inverse;
}
void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg)
{
assert(subreg < 2);
p->current->flag_subreg = reg * 2 + subreg;
}
void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode )
{
p->current->access_mode = access_mode;
}
void
brw_set_default_compression_control(struct brw_codegen *p,
enum brw_compression compression_control)
{
switch (compression_control) {
case BRW_COMPRESSION_NONE:
/* This is the "use the first set of bits of dmask/vmask/arf
* according to execsize" option.
*/
p->current->group = 0;
break;
case BRW_COMPRESSION_2NDHALF:
/* For SIMD8, this is "use the second set of 8 bits." */
p->current->group = 8;
break;
case BRW_COMPRESSION_COMPRESSED:
/* For SIMD16 instruction compression, use the first set of 16 bits
* since we don't do SIMD32 dispatch.
*/
p->current->group = 0;
break;
default:
unreachable("not reached");
}
if (p->devinfo->ver <= 6) {
p->current->compressed =
(compression_control == BRW_COMPRESSION_COMPRESSED);
}
}
/**
* Enable or disable instruction compression on the given instruction leaving
* the currently selected channel enable group untouched.
*/
void
brw_inst_set_compression(const struct intel_device_info *devinfo,
brw_inst *inst, bool on)
{
if (devinfo->ver >= 6) {
/* No-op, the EU will figure out for us whether the instruction needs to
* be compressed.
*/
} else {
/* The channel group and compression controls are non-orthogonal, there
* are two possible representations for uncompressed instructions and we
* may need to preserve the current one to avoid changing the selected
* channel group inadvertently.
*/
if (on)
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED);
else if (brw_inst_qtr_control(devinfo, inst)
== BRW_COMPRESSION_COMPRESSED)
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
}
}
void
brw_set_default_compression(struct brw_codegen *p, bool on)
{
p->current->compressed = on;
}
/**
* Apply the range of channel enable signals given by
* [group, group + exec_size) to the instruction passed as argument.
*/
void
brw_inst_set_group(const struct intel_device_info *devinfo,
brw_inst *inst, unsigned group)
{
if (devinfo->ver >= 20) {
assert(group % 8 == 0 && group < 32);
brw_inst_set_qtr_control(devinfo, inst, group / 8);
} else if (devinfo->ver >= 7) {
assert(group % 4 == 0 && group < 32);
brw_inst_set_qtr_control(devinfo, inst, group / 8);
brw_inst_set_nib_control(devinfo, inst, (group / 4) % 2);
} else if (devinfo->ver == 6) {
assert(group % 8 == 0 && group < 32);
brw_inst_set_qtr_control(devinfo, inst, group / 8);
} else {
assert(group % 8 == 0 && group < 16);
/* The channel group and compression controls are non-orthogonal, there
* are two possible representations for group zero and we may need to
* preserve the current one to avoid changing the selected compression
* enable inadvertently.
*/
if (group == 8)
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF);
else if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_2NDHALF)
brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
}
}
void
brw_set_default_group(struct brw_codegen *p, unsigned group)
{
p->current->group = group;
}
void brw_set_default_mask_control( struct brw_codegen *p, unsigned value )
{
p->current->mask_control = value;
}
void brw_set_default_saturate( struct brw_codegen *p, bool enable )
{
p->current->saturate = enable;
}
void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value)
{
p->current->acc_wr_control = value;
}
void brw_set_default_swsb(struct brw_codegen *p, struct tgl_swsb value)
{
p->current->swsb = value;
}
void brw_push_insn_state( struct brw_codegen *p )
{
assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
*(p->current + 1) = *p->current;
p->current++;
}
void brw_pop_insn_state( struct brw_codegen *p )
{
assert(p->current != p->stack);
p->current--;
}
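/* Typical usage (illustrative sketch; dst/src are placeholder registers):
* bracket a temporary change to the default instruction state so it does
* not leak into later instructions:
*
*    brw_push_insn_state(p);
*    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
*    brw_MOV(p, dst, src);
*    brw_pop_insn_state(p);
*/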
/***********************************************************************
*/
void
brw_init_codegen(const struct brw_isa_info *isa,
struct brw_codegen *p, void *mem_ctx)
{
memset(p, 0, sizeof(*p));
p->isa = isa;
p->devinfo = isa->devinfo;
p->automatic_exec_sizes = true;
/*
* Set the initial instruction store array size to 1024. If that turns
* out not to be enough, brw_next_insn() will keep doubling the store
* size until we run out of memory.
*/
p->store_size = 1024;
p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size);
p->nr_insn = 0;
p->current = p->stack;
memset(p->current, 0, sizeof(p->current[0]));
p->mem_ctx = mem_ctx;
/* Some defaults?
*/
brw_set_default_exec_size(p, BRW_EXECUTE_8);
brw_set_default_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */
brw_set_default_saturate(p, 0);
brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
/* Set up control flow stack */
p->if_stack_depth = 0;
p->if_stack_array_size = 16;
p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size);
p->loop_stack_depth = 0;
p->loop_stack_array_size = 16;
p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size);
p->if_depth_in_loop = rzalloc_array(mem_ctx, int, p->loop_stack_array_size);
}
const unsigned *brw_get_program( struct brw_codegen *p,
unsigned *sz )
{
*sz = p->next_insn_offset;
return (const unsigned *)p->store;
}
const struct brw_shader_reloc *
brw_get_shader_relocs(struct brw_codegen *p, unsigned *num_relocs)
{
*num_relocs = p->num_relocs;
return p->relocs;
}
DEBUG_GET_ONCE_OPTION(shader_bin_dump_path, "INTEL_SHADER_BIN_DUMP_PATH", NULL);
bool brw_should_dump_shader_bin(void)
{
return debug_get_option_shader_bin_dump_path() != NULL;
}
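/* Write [start_offset, end_offset) of `assembly` to
* $INTEL_SHADER_BIN_DUMP_PATH/<identifier>.bin, retrying short writes.
*/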
void brw_dump_shader_bin(void *assembly, int start_offset, int end_offset,
const char *identifier)
{
char *name = ralloc_asprintf(NULL, "%s/%s.bin",
debug_get_option_shader_bin_dump_path(),
identifier);
int fd = open(name, O_CREAT | O_WRONLY, 0777);
ralloc_free(name);
if (fd < 0)
return;
struct stat sb;
if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) {
close(fd);
return;
}
size_t to_write = end_offset - start_offset;
void *write_ptr = assembly + start_offset;
while (to_write) {
ssize_t ret = write(fd, write_ptr, to_write);
if (ret <= 0) {
close(fd);
return;
}
to_write -= ret;
write_ptr += ret;
}
close(fd);
}
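/* If INTEL_SHADER_ASM_READ_PATH is set and contains <identifier>.bin,
* replace everything from start_offset onward with that file's contents
* and validate the substituted instructions.
*/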
bool brw_try_override_assembly(struct brw_codegen *p, int start_offset,
const char *identifier)
{
const char *read_path = getenv("INTEL_SHADER_ASM_READ_PATH");
if (!read_path) {
return false;
}
char *name = ralloc_asprintf(NULL, "%s/%s.bin", read_path, identifier);
int fd = open(name, O_RDONLY);
ralloc_free(name);
if (fd == -1) {
return false;
}
struct stat sb;
if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) {
close(fd);
return false;
}
p->nr_insn -= (p->next_insn_offset - start_offset) / sizeof(brw_inst);
p->nr_insn += sb.st_size / sizeof(brw_inst);
p->next_insn_offset = start_offset + sb.st_size;
p->store_size = (start_offset + sb.st_size) / sizeof(brw_inst);
p->store = (brw_inst *)reralloc_size(p->mem_ctx, p->store, p->next_insn_offset);
assert(p->store);
ssize_t ret = read(fd, (char *)p->store + start_offset, sb.st_size);
close(fd);
if (ret != sb.st_size) {
return false;
}
ASSERTED bool valid =
brw_validate_instructions(p->isa, p->store,
start_offset, p->next_insn_offset,
NULL);
assert(valid);
return true;
}
const struct brw_label *
brw_find_label(const struct brw_label *root, int offset)
{
for (const struct brw_label *curr = root; curr != NULL; curr = curr->next) {
if (curr->offset == offset)
return curr;
}
return NULL;
}
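/* Insert a label for `offset` into the list unless one already exists,
* numbering it one past the current tail's number.
*/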
void
brw_create_label(struct brw_label **labels, int offset, void *mem_ctx)
{
if (*labels != NULL) {
struct brw_label *curr = *labels;
struct brw_label *prev;
do {
prev = curr;
if (curr->offset == offset)
return;
curr = curr->next;
} while (curr != NULL);
curr = ralloc(mem_ctx, struct brw_label);
curr->offset = offset;
curr->number = prev->number + 1;
curr->next = NULL;
prev->next = curr;
} else {
struct brw_label *root = ralloc(mem_ctx, struct brw_label);
root->number = 0;
root->offset = offset;
root->next = NULL;
*labels = root;
}
}
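/* Scan the assembly in [start, end) and create a label at every JIP/UIP
* jump target, so that the disassembly can print LABELn: markers.
*/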
const struct brw_label *
brw_label_assembly(const struct brw_isa_info *isa,
const void *assembly, int start, int end, void *mem_ctx)
{
const struct intel_device_info *const devinfo = isa->devinfo;
struct brw_label *root_label = NULL;
int to_bytes_scale = sizeof(brw_inst) / brw_jump_scale(devinfo);
for (int offset = start; offset < end;) {
const brw_inst *inst = (const brw_inst *) ((const char *) assembly + offset);
brw_inst uncompacted;
bool is_compact = brw_inst_cmpt_control(devinfo, inst);
if (is_compact) {
brw_compact_inst *compacted = (brw_compact_inst *)inst;
brw_uncompact_instruction(isa, &uncompacted, compacted);
inst = &uncompacted;
}
if (brw_has_uip(devinfo, brw_inst_opcode(isa, inst))) {
/* Instructions that have UIP also have JIP. */
brw_create_label(&root_label,
offset + brw_inst_uip(devinfo, inst) * to_bytes_scale, mem_ctx);
brw_create_label(&root_label,
offset + brw_inst_jip(devinfo, inst) * to_bytes_scale, mem_ctx);
} else if (brw_has_jip(devinfo, brw_inst_opcode(isa, inst))) {
int jip;
if (devinfo->ver >= 7) {
jip = brw_inst_jip(devinfo, inst);
} else {
jip = brw_inst_gfx6_jump_count(devinfo, inst);
}
brw_create_label(&root_label, offset + jip * to_bytes_scale, mem_ctx);
}
if (is_compact) {
offset += sizeof(brw_compact_inst);
} else {
offset += sizeof(brw_inst);
}
}
return root_label;
}
void
brw_disassemble_with_labels(const struct brw_isa_info *isa,
const void *assembly, int start, int end, FILE *out)
{
void *mem_ctx = ralloc_context(NULL);
const struct brw_label *root_label =
brw_label_assembly(isa, assembly, start, end, mem_ctx);
brw_disassemble(isa, assembly, start, end, root_label, out);
ralloc_free(mem_ctx);
}
void
brw_disassemble(const struct brw_isa_info *isa,
const void *assembly, int start, int end,
const struct brw_label *root_label, FILE *out)
{
const struct intel_device_info *devinfo = isa->devinfo;
bool dump_hex = INTEL_DEBUG(DEBUG_HEX);
for (int offset = start; offset < end;) {
const brw_inst *insn = (const brw_inst *)((char *)assembly + offset);
brw_inst uncompacted;
if (root_label != NULL) {
const struct brw_label *label = brw_find_label(root_label, offset);
if (label != NULL) {
fprintf(out, "\nLABEL%d:\n", label->number);
}
}
bool compacted = brw_inst_cmpt_control(devinfo, insn);
if (0)
fprintf(out, "0x%08x: ", offset);
if (compacted) {
brw_compact_inst *compacted = (brw_compact_inst *)insn;
if (dump_hex) {
unsigned char * insn_ptr = ((unsigned char *)&insn[0]);
const unsigned int blank_spaces = 24;
for (int i = 0 ; i < 8; i = i + 4) {
fprintf(out, "%02x %02x %02x %02x ",
insn_ptr[i],
insn_ptr[i + 1],
insn_ptr[i + 2],
insn_ptr[i + 3]);
}
/* Pad the hex output of compacted instructions so it stays vertically
* aligned with the hex output of uncompacted instructions.
*/
fprintf(out, "%*c", blank_spaces, ' ');
}
brw_uncompact_instruction(isa, &uncompacted, compacted);
insn = &uncompacted;
} else {
if (dump_hex) {
unsigned char * insn_ptr = ((unsigned char *)&insn[0]);
for (int i = 0 ; i < 16; i = i + 4) {
fprintf(out, "%02x %02x %02x %02x ",
insn_ptr[i],
insn_ptr[i + 1],
insn_ptr[i + 2],
insn_ptr[i + 3]);
}
}
}
brw_disassemble_inst(out, isa, insn, compacted, offset, root_label);
if (compacted) {
offset += sizeof(brw_compact_inst);
} else {
offset += sizeof(brw_inst);
}
}
}
static const struct opcode_desc opcode_descs[] = {
/* IR, HW, name, nsrc, ndst, gfx_vers */
{ BRW_OPCODE_ILLEGAL, 0, "illegal", 0, 0, GFX_ALL },
{ BRW_OPCODE_SYNC, 1, "sync", 1, 0, GFX_GE(GFX12) },
{ BRW_OPCODE_MOV, 1, "mov", 1, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_MOV, 97, "mov", 1, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SEL, 2, "sel", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SEL, 98, "sel", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_MOVI, 3, "movi", 2, 1, GFX_GE(GFX45) & GFX_LT(GFX12) },
{ BRW_OPCODE_MOVI, 99, "movi", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_NOT, 4, "not", 1, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_NOT, 100, "not", 1, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_AND, 5, "and", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_AND, 101, "and", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_OR, 6, "or", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_OR, 102, "or", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_XOR, 7, "xor", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_XOR, 103, "xor", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SHR, 8, "shr", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SHR, 104, "shr", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SHL, 9, "shl", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SHL, 105, "shl", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_DIM, 10, "dim", 1, 1, GFX75 },
{ BRW_OPCODE_SMOV, 10, "smov", 0, 0, GFX_GE(GFX8) & GFX_LT(GFX12) },
{ BRW_OPCODE_SMOV, 106, "smov", 0, 0, GFX_GE(GFX12) },
{ BRW_OPCODE_ASR, 12, "asr", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_ASR, 108, "asr", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_ROR, 14, "ror", 2, 1, GFX11 },
{ BRW_OPCODE_ROR, 110, "ror", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_ROL, 15, "rol", 2, 1, GFX11 },
{ BRW_OPCODE_ROL, 111, "rol", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_CMP, 16, "cmp", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_CMP, 112, "cmp", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_CMPN, 17, "cmpn", 2, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_CMPN, 113, "cmpn", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_CSEL, 18, "csel", 3, 1, GFX_GE(GFX8) & GFX_LT(GFX12) },
{ BRW_OPCODE_CSEL, 114, "csel", 3, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_F32TO16, 19, "f32to16", 1, 1, GFX7 | GFX75 },
{ BRW_OPCODE_F16TO32, 20, "f16to32", 1, 1, GFX7 | GFX75 },
{ BRW_OPCODE_BFREV, 23, "bfrev", 1, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
{ BRW_OPCODE_BFREV, 119, "bfrev", 1, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_BFE, 24, "bfe", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
{ BRW_OPCODE_BFE, 120, "bfe", 3, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_BFI1, 25, "bfi1", 2, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
{ BRW_OPCODE_BFI1, 121, "bfi1", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_BFI2, 26, "bfi2", 3, 1, GFX_GE(GFX7) & GFX_LT(GFX12) },
{ BRW_OPCODE_BFI2, 122, "bfi2", 3, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_JMPI, 32, "jmpi", 0, 0, GFX_ALL },
{ BRW_OPCODE_BRD, 33, "brd", 0, 0, GFX_GE(GFX7) },
{ BRW_OPCODE_IF, 34, "if", 0, 0, GFX_ALL },
{ BRW_OPCODE_IFF, 35, "iff", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_BRC, 35, "brc", 0, 0, GFX_GE(GFX7) },
{ BRW_OPCODE_ELSE, 36, "else", 0, 0, GFX_ALL },
{ BRW_OPCODE_ENDIF, 37, "endif", 0, 0, GFX_ALL },
{ BRW_OPCODE_DO, 38, "do", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_CASE, 38, "case", 0, 0, GFX6 },
{ BRW_OPCODE_WHILE, 39, "while", 0, 0, GFX_ALL },
{ BRW_OPCODE_BREAK, 40, "break", 0, 0, GFX_ALL },
{ BRW_OPCODE_CONTINUE, 41, "cont", 0, 0, GFX_ALL },
{ BRW_OPCODE_HALT, 42, "halt", 0, 0, GFX_ALL },
{ BRW_OPCODE_CALLA, 43, "calla", 0, 0, GFX_GE(GFX75) },
{ BRW_OPCODE_MSAVE, 44, "msave", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_CALL, 44, "call", 0, 0, GFX_GE(GFX6) },
{ BRW_OPCODE_MREST, 45, "mrest", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_RET, 45, "ret", 0, 0, GFX_GE(GFX6) },
{ BRW_OPCODE_PUSH, 46, "push", 0, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_FORK, 46, "fork", 0, 0, GFX6 },
{ BRW_OPCODE_GOTO, 46, "goto", 0, 0, GFX_GE(GFX8) },
{ BRW_OPCODE_POP, 47, "pop", 2, 0, GFX_LE(GFX5) },
{ BRW_OPCODE_WAIT, 48, "wait", 0, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SEND, 49, "send", 1, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SENDC, 50, "sendc", 1, 1, GFX_LT(GFX12) },
{ BRW_OPCODE_SEND, 49, "send", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SENDC, 50, "sendc", 2, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_SENDS, 51, "sends", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) },
{ BRW_OPCODE_SENDSC, 52, "sendsc", 2, 1, GFX_GE(GFX9) & GFX_LT(GFX12) },
{ BRW_OPCODE_MATH, 56, "math", 2, 1, GFX_GE(GFX6) },
{ BRW_OPCODE_ADD, 64, "add", 2, 1, GFX_ALL },
{ BRW_OPCODE_MUL, 65, "mul", 2, 1, GFX_ALL },
{ BRW_OPCODE_AVG, 66, "avg", 2, 1, GFX_ALL },
{ BRW_OPCODE_FRC, 67, "frc", 1, 1, GFX_ALL },
{ BRW_OPCODE_RNDU, 68, "rndu", 1, 1, GFX_ALL },
{ BRW_OPCODE_RNDD, 69, "rndd", 1, 1, GFX_ALL },
{ BRW_OPCODE_RNDE, 70, "rnde", 1, 1, GFX_ALL },
{ BRW_OPCODE_RNDZ, 71, "rndz", 1, 1, GFX_ALL },
{ BRW_OPCODE_MAC, 72, "mac", 2, 1, GFX_ALL },
{ BRW_OPCODE_MACH, 73, "mach", 2, 1, GFX_ALL },
{ BRW_OPCODE_LZD, 74, "lzd", 1, 1, GFX_ALL },
{ BRW_OPCODE_FBH, 75, "fbh", 1, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_FBL, 76, "fbl", 1, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_CBIT, 77, "cbit", 1, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_ADDC, 78, "addc", 2, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_SUBB, 79, "subb", 2, 1, GFX_GE(GFX7) },
{ BRW_OPCODE_SAD2, 80, "sad2", 2, 1, GFX_ALL },
{ BRW_OPCODE_SADA2, 81, "sada2", 2, 1, GFX_ALL },
{ BRW_OPCODE_ADD3, 82, "add3", 3, 1, GFX_GE(GFX125) },
{ BRW_OPCODE_DP4, 84, "dp4", 2, 1, GFX_LT(GFX11) },
{ BRW_OPCODE_DPH, 85, "dph", 2, 1, GFX_LT(GFX11) },
{ BRW_OPCODE_DP3, 86, "dp3", 2, 1, GFX_LT(GFX11) },
{ BRW_OPCODE_DP2, 87, "dp2", 2, 1, GFX_LT(GFX11) },
{ BRW_OPCODE_DP4A, 88, "dp4a", 3, 1, GFX_GE(GFX12) },
{ BRW_OPCODE_LINE, 89, "line", 2, 1, GFX_LE(GFX10) },
{ BRW_OPCODE_DPAS, 89, "dpas", 3, 1, GFX_GE(GFX125) },
{ BRW_OPCODE_PLN, 90, "pln", 2, 1, GFX_GE(GFX45) & GFX_LE(GFX10) },
{ BRW_OPCODE_MAD, 91, "mad", 3, 1, GFX_GE(GFX6) },
{ BRW_OPCODE_LRP, 92, "lrp", 3, 1, GFX_GE(GFX6) & GFX_LE(GFX10) },
{ BRW_OPCODE_MADM, 93, "madm", 3, 1, GFX_GE(GFX8) },
{ BRW_OPCODE_NENOP, 125, "nenop", 0, 0, GFX45 },
{ BRW_OPCODE_NOP, 126, "nop", 0, 0, GFX_LT(GFX12) },
{ BRW_OPCODE_NOP, 96, "nop", 0, 0, GFX_GE(GFX12) }
};
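/* Build the per-device IR<->HW opcode lookup tables from opcode_descs,
* keeping only entries whose gfx_vers mask matches this device's
* generation.
*/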
void
brw_init_isa_info(struct brw_isa_info *isa,
const struct intel_device_info *devinfo)
{
isa->devinfo = devinfo;
enum gfx_ver ver = gfx_ver_from_devinfo(devinfo);
memset(isa->ir_to_descs, 0, sizeof(isa->ir_to_descs));
memset(isa->hw_to_descs, 0, sizeof(isa->hw_to_descs));
for (unsigned i = 0; i < ARRAY_SIZE(opcode_descs); i++) {
if (opcode_descs[i].gfx_vers & ver) {
const unsigned e = opcode_descs[i].ir;
const unsigned h = opcode_descs[i].hw;
assert(e < ARRAY_SIZE(isa->ir_to_descs) && !isa->ir_to_descs[e]);
assert(h < ARRAY_SIZE(isa->hw_to_descs) && !isa->hw_to_descs[h]);
isa->ir_to_descs[e] = &opcode_descs[i];
isa->hw_to_descs[h] = &opcode_descs[i];
}
}
}
/**
* Return the matching opcode_desc for the specified IR opcode and hardware
* generation, or NULL if the opcode is not supported by the device.
*/
const struct opcode_desc *
brw_opcode_desc(const struct brw_isa_info *isa, enum opcode op)
{
return op < ARRAY_SIZE(isa->ir_to_descs) ? isa->ir_to_descs[op] : NULL;
}
/**
* Return the matching opcode_desc for the specified HW opcode and hardware
* generation, or NULL if the opcode is not supported by the device.
*/
const struct opcode_desc *
brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw)
{
return hw < ARRAY_SIZE(isa->hw_to_descs) ? isa->hw_to_descs[hw] : NULL;
}
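/* Return the number of sources read by `inst`, resolving the cases (MATH,
* and SEND before Gfx6) where the count depends on more than the opcode
* table entry.
*/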
unsigned
brw_num_sources_from_inst(const struct brw_isa_info *isa,
const brw_inst *inst)
{
const struct intel_device_info *devinfo = isa->devinfo;
const struct opcode_desc *desc =
brw_opcode_desc(isa, brw_inst_opcode(isa, inst));
unsigned math_function;
if (brw_inst_opcode(isa, inst) == BRW_OPCODE_MATH) {
math_function = brw_inst_math_function(devinfo, inst);
} else if (devinfo->ver < 6 &&
brw_inst_opcode(isa, inst) == BRW_OPCODE_SEND) {
if (brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH) {
/* src1 must be a descriptor (including the information to determine
* that the SEND is doing an extended math operation), but src0 can
* actually be null since it serves as the source of the implicit GRF
* to MRF move.
*
* If we stop using that functionality, we'll have to revisit this.
*/
return 2;
} else {
/* Send instructions are allowed to have null sources since they use
* the base_mrf field to specify the message register source.
*/
return 0;
}
} else {
assert(desc->nsrc < 4);
return desc->nsrc;
}
switch (math_function) {
case BRW_MATH_FUNCTION_INV:
case BRW_MATH_FUNCTION_LOG:
case BRW_MATH_FUNCTION_EXP:
case BRW_MATH_FUNCTION_SQRT:
case BRW_MATH_FUNCTION_RSQ:
case BRW_MATH_FUNCTION_SIN:
case BRW_MATH_FUNCTION_COS:
case BRW_MATH_FUNCTION_SINCOS:
case GFX8_MATH_FUNCTION_INVM:
case GFX8_MATH_FUNCTION_RSQRTM:
return 1;
case BRW_MATH_FUNCTION_FDIV:
case BRW_MATH_FUNCTION_POW:
case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
return 2;
default:
unreachable("not reached");
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,119 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keithw@vmware.com>
*/
#include "brw_eu_defines.h"
#include "brw_eu.h"
void brw_math_invert( struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src)
{
gfx4_math(p,
dst,
BRW_MATH_FUNCTION_INV,
0,
src,
BRW_MATH_PRECISION_FULL);
}
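/* Copy `count` 32-byte chunks from src to dst, as pairs of 16-byte vec4
* MOVs.
*/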
void brw_copy4(struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src,
unsigned count)
{
unsigned i;
dst = vec4(dst);
src = vec4(src);
for (i = 0; i < count; i++)
{
unsigned delta = i*32;
brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16));
}
}
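/* Copy `count` 32-byte chunks from src to dst, one 32-byte vec8 MOV each. */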
void brw_copy8(struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src,
unsigned count)
{
unsigned i;
dst = vec8(dst);
src = vec8(src);
for (i = 0; i < count; i++)
{
unsigned delta = i*32;
brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
}
}
void brw_copy_indirect_to_indirect(struct brw_codegen *p,
struct brw_indirect dst_ptr,
struct brw_indirect src_ptr,
unsigned count)
{
unsigned i;
for (i = 0; i < count; i++)
{
unsigned delta = i*32;
brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta));
brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16));
}
}
void brw_copy_from_indirect(struct brw_codegen *p,
struct brw_reg dst,
struct brw_indirect ptr,
unsigned count)
{
unsigned i;
dst = vec4(dst);
for (i = 0; i < count; i++)
{
unsigned delta = i*32;
brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta));
brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16));
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,637 @@
/*
* Copyright © 2010 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#ifndef BRW_FS_H
#define BRW_FS_H
#include "brw_shader.h"
#include "brw_ir_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_ir_performance.h"
#include "compiler/nir/nir.h"
struct bblock_t;
namespace {
struct acp_entry;
}
class fs_visitor;
namespace brw {
/**
* Register pressure analysis of a shader. Estimates how many registers
* are live at any point of the program in GRF units.
*/
struct register_pressure {
register_pressure(const fs_visitor *v);
~register_pressure();
analysis_dependency_class
dependency_class() const
{
return (DEPENDENCY_INSTRUCTION_IDENTITY |
DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_VARIABLES);
}
bool
validate(const fs_visitor *) const
{
/* FINISHME */
return true;
}
unsigned *regs_live_at_ip;
};
}
struct brw_gs_compile;
namespace brw {
class fs_builder;
}
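/* Per-compile statistics gathered during code generation, e.g. for
* shader-db style reporting.
*/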
struct shader_stats {
const char *scheduler_mode;
unsigned promoted_constants;
unsigned spill_count;
unsigned fill_count;
unsigned max_register_pressure;
};
/** Register numbers for thread payload fields. */
struct thread_payload {
/** The number of thread payload registers the hardware will supply. */
uint8_t num_regs;
virtual ~thread_payload() = default;
protected:
thread_payload() : num_regs() {}
};
struct vs_thread_payload : public thread_payload {
vs_thread_payload(const fs_visitor &v);
fs_reg urb_handles;
};
struct tcs_thread_payload : public thread_payload {
tcs_thread_payload(const fs_visitor &v);
fs_reg patch_urb_output;
fs_reg primitive_id;
fs_reg icp_handle_start;
};
struct tes_thread_payload : public thread_payload {
tes_thread_payload(const fs_visitor &v);
fs_reg patch_urb_input;
fs_reg primitive_id;
fs_reg coords[3];
fs_reg urb_output;
};
struct gs_thread_payload : public thread_payload {
gs_thread_payload(fs_visitor &v);
fs_reg urb_handles;
fs_reg primitive_id;
fs_reg instance_id;
fs_reg icp_handle_start;
};
struct fs_thread_payload : public thread_payload {
fs_thread_payload(const fs_visitor &v,
bool &source_depth_to_render_target,
bool &runtime_check_aads_emit);
uint8_t subspan_coord_reg[2];
uint8_t source_depth_reg[2];
uint8_t source_w_reg[2];
uint8_t aa_dest_stencil_reg[2];
uint8_t dest_depth_reg[2];
uint8_t sample_pos_reg[2];
uint8_t sample_mask_in_reg[2];
uint8_t depth_w_coef_reg;
uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2];
};
struct cs_thread_payload : public thread_payload {
cs_thread_payload(const fs_visitor &v);
void load_subgroup_id(const brw::fs_builder &bld, fs_reg &dest) const;
fs_reg local_invocation_id[3];
protected:
fs_reg subgroup_id_;
};
struct task_mesh_thread_payload : public cs_thread_payload {
task_mesh_thread_payload(fs_visitor &v);
fs_reg extended_parameter_0;
fs_reg local_index;
fs_reg inline_parameter;
fs_reg urb_output;
/* URB handle used to read Task memory inputs. Only valid in the MESH stage. */
fs_reg task_urb_input;
};
struct bs_thread_payload : public thread_payload {
bs_thread_payload(const fs_visitor &v);
fs_reg global_arg_ptr;
fs_reg local_arg_ptr;
void load_shader_type(const brw::fs_builder &bld, fs_reg &dest) const;
};
class fs_instruction_scheduler;
/**
* The fragment shader front-end.
*
* Translates NIR into FS IR.
*/
class fs_visitor : public backend_shader
{
public:
fs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const brw_base_prog_key *key,
struct brw_stage_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
bool needs_register_pressure,
bool debug_enabled);
fs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const brw_wm_prog_key *key,
struct brw_wm_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
unsigned num_polygons,
bool needs_register_pressure,
bool debug_enabled);
fs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
struct brw_gs_compile *gs_compile,
struct brw_gs_prog_data *prog_data,
const nir_shader *shader,
bool needs_register_pressure,
bool debug_enabled);
void init();
~fs_visitor();
fs_reg vgrf(const glsl_type *const type);
void import_uniforms(fs_visitor *v);
void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
const fs_reg &dst,
const fs_reg &surface,
const fs_reg &surface_handle,
const fs_reg &varying_offset,
uint32_t const_offset,
uint8_t alignment,
unsigned components);
void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
bool run_fs(bool allow_spilling, bool do_rep_send);
bool run_vs();
bool run_tcs();
bool run_tes();
bool run_gs();
bool run_cs(bool allow_spilling);
bool run_bs(bool allow_spilling);
bool run_task(bool allow_spilling);
bool run_mesh(bool allow_spilling);
void optimize();
void allocate_registers(bool allow_spilling);
uint32_t compute_max_register_pressure();
bool fixup_sends_duplicate_payload();
void fixup_3src_null_dest();
void emit_dummy_memory_fence_before_eot();
void emit_dummy_mov_instruction();
bool fixup_nomask_control_flow();
void assign_curb_setup();
void assign_urb_setup();
void convert_attr_sources_to_hw_regs(fs_inst *inst);
void assign_vs_urb_setup();
void assign_tcs_urb_setup();
void assign_tes_urb_setup();
void assign_gs_urb_setup();
bool assign_regs(bool allow_spilling, bool spill_all);
void assign_regs_trivial();
void calculate_payload_ranges(unsigned payload_node_count,
int *payload_last_use_ip) const;
bool split_virtual_grfs();
bool compact_virtual_grfs();
void assign_constant_locations();
bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index,
unsigned *out_pull_index);
bool lower_constant_loads();
virtual void invalidate_analysis(brw::analysis_dependency_class c);
#ifndef NDEBUG
void validate();
#else
void validate() {}
#endif
bool opt_algebraic();
bool opt_redundant_halt();
bool opt_cse();
bool opt_cse_local(const brw::fs_live_variables &live, bblock_t *block, int &ip);
bool opt_copy_propagation();
bool opt_bank_conflicts();
bool opt_split_sends();
bool register_coalesce();
bool compute_to_mrf();
bool eliminate_find_live_channel();
bool dead_code_eliminate();
bool remove_duplicate_mrf_writes();
bool remove_extra_rounding_modes();
fs_instruction_scheduler *prepare_scheduler(void *mem_ctx);
void schedule_instructions_pre_ra(fs_instruction_scheduler *sched,
instruction_scheduler_mode mode);
void schedule_instructions_post_ra();
void insert_gfx4_send_dependency_workarounds();
void insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
fs_inst *inst);
void insert_gfx4_post_send_dependency_workarounds(bblock_t *block,
fs_inst *inst);
void vfail(const char *msg, va_list args);
void fail(const char *msg, ...);
void limit_dispatch_width(unsigned n, const char *msg);
bool lower_uniform_pull_constant_loads();
bool lower_load_payload();
bool lower_pack();
bool lower_regioning();
bool lower_logical_sends();
bool lower_integer_multiplication();
bool lower_minmax();
bool lower_simd_width();
bool lower_barycentrics();
bool lower_derivatives();
bool lower_find_live_channel();
bool lower_scoreboard();
bool lower_sub_sat();
bool opt_combine_constants();
void emit_repclear_shader();
void emit_interpolation_setup_gfx4();
void emit_interpolation_setup_gfx6();
bool opt_peephole_sel();
bool opt_saturate_propagation();
bool opt_cmod_propagation();
bool opt_zero_samples();
void set_tcs_invocation_id();
void emit_alpha_test();
fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
fs_reg color1, fs_reg color2,
fs_reg src0_alpha, unsigned components);
void do_emit_fb_writes(int nr_color_regions, bool replicate_alpha);
void emit_fb_writes();
void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
void emit_gs_control_data_bits(const fs_reg &vertex_count);
void emit_gs_thread_end();
bool mark_last_urb_write_with_eot();
void emit_tcs_thread_end();
void emit_urb_fence();
void emit_cs_terminate();
fs_reg interp_reg(const brw::fs_builder &bld, unsigned location,
unsigned channel, unsigned comp);
fs_reg per_primitive_reg(const brw::fs_builder &bld,
int location, unsigned comp);
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
virtual void dump_instructions_to_file(FILE *file) const;
const brw_base_prog_key *const key;
const struct brw_sampler_prog_key_data *key_tex;
struct brw_gs_compile *gs_compile;
struct brw_stage_prog_data *prog_data;
brw_analysis<brw::fs_live_variables, backend_shader> live_analysis;
brw_analysis<brw::register_pressure, fs_visitor> regpressure_analysis;
brw_analysis<brw::performance, fs_visitor> performance_analysis;
/** Number of uniform variable components visited. */
unsigned uniforms;
/** Byte-offset for the next available spot in the scratch space buffer. */
unsigned last_scratch;
/**
* Array mapping UNIFORM register numbers to the push parameter index,
* or -1 if this uniform register isn't being uploaded as a push constant.
*/
int *push_constant_loc;
fs_reg frag_depth;
fs_reg frag_stencil;
fs_reg sample_mask;
fs_reg outputs[VARYING_SLOT_MAX];
fs_reg dual_src_output;
int first_non_payload_grf;
/** Either BRW_MAX_GRF or GFX7_MRF_HACK_START */
unsigned max_grf;
bool failed;
char *fail_msg;
thread_payload *payload_;
thread_payload &payload() {
return *this->payload_;
}
vs_thread_payload &vs_payload() {
assert(stage == MESA_SHADER_VERTEX);
return *static_cast<vs_thread_payload *>(this->payload_);
}
tcs_thread_payload &tcs_payload() {
assert(stage == MESA_SHADER_TESS_CTRL);
return *static_cast<tcs_thread_payload *>(this->payload_);
}
tes_thread_payload &tes_payload() {
assert(stage == MESA_SHADER_TESS_EVAL);
return *static_cast<tes_thread_payload *>(this->payload_);
}
gs_thread_payload &gs_payload() {
assert(stage == MESA_SHADER_GEOMETRY);
return *static_cast<gs_thread_payload *>(this->payload_);
}
fs_thread_payload &fs_payload() {
assert(stage == MESA_SHADER_FRAGMENT);
return *static_cast<fs_thread_payload *>(this->payload_);
}
cs_thread_payload &cs_payload() {
assert(gl_shader_stage_uses_workgroup(stage));
return *static_cast<cs_thread_payload *>(this->payload_);
}
task_mesh_thread_payload &task_mesh_payload() {
assert(stage == MESA_SHADER_TASK || stage == MESA_SHADER_MESH);
return *static_cast<task_mesh_thread_payload *>(this->payload_);
}
bs_thread_payload &bs_payload() {
assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
return *static_cast<bs_thread_payload *>(this->payload_);
}
bool source_depth_to_render_target;
bool runtime_check_aads_emit;
fs_reg pixel_x;
fs_reg pixel_y;
fs_reg pixel_z;
fs_reg wpos_w;
fs_reg pixel_w;
fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
fs_reg final_gs_vertex_count;
fs_reg control_data_bits;
fs_reg invocation_id;
unsigned grf_used;
bool spilled_any_registers;
bool needs_register_pressure;
const unsigned dispatch_width; /**< 8, 16 or 32 */
const unsigned max_polygons;
unsigned max_dispatch_width;
/* The API-selected subgroup size. */
unsigned api_subgroup_size; /**< 0, 8, 16, 32 */
struct shader_stats shader_stats;
void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
void lower_mul_qword_inst(fs_inst *inst, bblock_t *block);
void lower_mulh_inst(fs_inst *inst, bblock_t *block);
unsigned workgroup_size() const;
void debug_optimizer(const nir_shader *nir,
const char *pass_name,
int iteration, int pass_num) const;
};
/**
* Return the flag register used in fragment shaders to keep track of live
* samples. On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32
* dispatch mode, while earlier generations are constrained to f0.1, which
* limits the dispatch width to SIMD16 for fragment shaders that use discard.
*/
static inline unsigned
sample_mask_flag_subreg(const fs_visitor &s)
{
assert(s.stage == MESA_SHADER_FRAGMENT);
return s.devinfo->ver >= 7 ? 2 : 1;
}
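/* Illustrative note: the returned value is a linear flag subregister index
 * (nr = idx / 2, subnr = idx % 2 under the usual encoding), so 1 selects
 * f0.1 and 2 selects f1.0, matching the constraints described above.
 */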
/**
* The fragment shader code generator.
*
* Translates FS IR to actual i965 assembly code.
*/
class fs_generator
{
public:
fs_generator(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
struct brw_stage_prog_data *prog_data,
bool runtime_check_aads_emit,
gl_shader_stage stage);
~fs_generator();
void enable_debug(const char *shader_name);
int generate_code(const cfg_t *cfg, int dispatch_width,
struct shader_stats shader_stats,
const brw::performance &perf,
struct brw_compile_stats *stats,
unsigned max_polygons = 0);
void add_const_data(void *data, unsigned size);
void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt);
const unsigned *get_assembly();
private:
void fire_fb_write(fs_inst *inst,
struct brw_reg payload,
struct brw_reg implied_header,
GLuint nr);
void generate_send(fs_inst *inst,
struct brw_reg dst,
struct brw_reg desc,
struct brw_reg ex_desc,
struct brw_reg payload,
struct brw_reg payload2);
void generate_fb_write(fs_inst *inst, struct brw_reg payload);
void generate_fb_read(fs_inst *inst, struct brw_reg dst,
struct brw_reg payload);
void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
void generate_barrier(fs_inst *inst, struct brw_reg src);
bool generate_linterp(fs_inst *inst, struct brw_reg dst,
struct brw_reg *src);
void generate_tex(fs_inst *inst, struct brw_reg dst,
struct brw_reg surface_index,
struct brw_reg sampler_index);
void generate_ddx(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src);
void generate_ddy(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src);
void generate_scratch_write(fs_inst *inst, struct brw_reg src);
void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
void generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst);
void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
struct brw_reg index,
struct brw_reg offset);
void generate_varying_pull_constant_load_gfx4(fs_inst *inst,
struct brw_reg dst,
struct brw_reg index);
void generate_set_sample_id(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src0,
struct brw_reg src1);
void generate_halt(fs_inst *inst);
void generate_mov_indirect(fs_inst *inst,
struct brw_reg dst,
struct brw_reg reg,
struct brw_reg indirect_byte_offset);
void generate_shuffle(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src,
struct brw_reg idx);
void generate_quad_swizzle(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src,
unsigned swiz);
bool patch_halt_jumps();
const struct brw_compiler *compiler;
const struct brw_compile_params *params;
const struct intel_device_info *devinfo;
struct brw_codegen *p;
struct brw_stage_prog_data * const prog_data;
unsigned dispatch_width; /**< 8, 16 or 32 */
exec_list discard_halt_patches;
bool runtime_check_aads_emit;
bool debug_flag;
const char *shader_name;
gl_shader_stage stage;
void *mem_ctx;
};
namespace brw {
fs_reg
fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
brw_reg_type type = BRW_REGISTER_TYPE_F,
unsigned n = 1);
fs_reg
fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2]);
inline fs_reg
dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
{
return fs_reg(UNIFORM, wm_prog_data->msaa_flags_param,
BRW_REGISTER_TYPE_UD);
}
void
check_dynamic_msaa_flag(const fs_builder &bld,
const struct brw_wm_prog_data *wm_prog_data,
enum intel_msaa_flags flag);
bool
lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
}
void shuffle_from_32bit_read(const brw::fs_builder &bld,
const fs_reg &dst,
const fs_reg &src,
uint32_t first_component,
uint32_t components);
fs_reg setup_imm_df(const brw::fs_builder &bld,
double v);
fs_reg setup_imm_b(const brw::fs_builder &bld,
int8_t v);
fs_reg setup_imm_ub(const brw::fs_builder &bld,
uint8_t v);
enum brw_barycentric_mode brw_barycentric_mode(nir_intrinsic_instr *intr);
uint32_t brw_fb_write_msg_control(const fs_inst *inst,
const struct brw_wm_prog_data *prog_data);
void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data);
bool brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width);
fs_reg brw_sample_mask_reg(const brw::fs_builder &bld);
void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst);
int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
const brw_stage_prog_data *prog_data);
bool brw_lower_dpas(fs_visitor &v);
void nir_to_brw(fs_visitor *s);
#endif /* BRW_FS_H */

View file

@ -0,0 +1,955 @@
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/** @file brw_fs_bank_conflicts.cpp
*
* This file contains a GRF bank conflict mitigation pass. The pass is
* intended to be run after register allocation and works by rearranging the
* layout of the GRF space (without altering the semantics of the program) in
* a way that minimizes the number of GRF bank conflicts incurred by ternary
* instructions.
*
* Unfortunately there is close to no information about bank conflicts in the
* hardware spec, but experimentally on Gfx7-Gfx9 ternary instructions seem to
* incur an average bank conflict penalty of one cycle per SIMD8 op whenever
* the second and third source are stored in the same GRF bank (\sa bank_of()
* for the exact bank layout) which cannot be fetched during the same cycle by
* the EU, unless the EU logic manages to optimize out the read cycle of a
* duplicate source register (\sa is_conflict_optimized_out()).
*
* The asymptotic run-time of the algorithm is dominated by the
* shader_conflict_weight_matrix() computation below, which is O(n) on the
* number of instructions in the program, however for small and medium-sized
* programs the run-time is likely to be dominated by
* optimize_reg_permutation() which is O(m^3) on the number of GRF atoms of
* the program (\sa partitioning), which is bounded (since the program uses a
* bounded number of registers post-regalloc) and of the order of 100. For
* that reason optimize_reg_permutation() is vectorized in order to keep the
* cubic term within reasonable bounds for m close to its theoretical maximum.
*/
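/* Worked example (with illustrative register numbers): a ternary
 * instruction such as
 *
 *    mad(8)  r10  r12  r66  r78
 *
 * reads its second and third sources from r66 and r78, which both map to
 * bank 2 under the layout implemented by bank_of() below, so on Gfx7-Gfx9
 * it would pay the one-cycle SIMD8 conflict penalty unless this pass
 * relocates one of them to a register in a different bank.
 */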
#include "brw_fs.h"
#include "brw_cfg.h"
#ifdef __SSE2__
#include <emmintrin.h>
/**
* Thin layer around vector intrinsics so they can be easily replaced with
* e.g. the fall-back scalar path, an implementation with different vector
* width or using different SIMD architectures (AVX-512?!).
*
* This implementation operates on pairs of independent SSE2 integer vectors à
* la SIMD16 for somewhat improved throughput. SSE2 is supported by virtually
* all platforms that care about bank conflicts, so this path should almost
* always be available in practice.
*/
namespace {
/**
* SIMD integer vector data type.
*/
struct vector_type {
__m128i v[2];
};
/**
* Scalar data type matching the representation of a single component of \p
* vector_type.
*/
typedef int16_t scalar_type;
/**
* Maximum integer value representable as a \p scalar_type.
*/
const scalar_type max_scalar = INT16_MAX;
/**
* Number of components of a \p vector_type.
*/
const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);
/**
* Set the i-th component of vector \p v to \p x.
*/
void
set(vector_type &v, unsigned i, scalar_type x)
{
assert(i < vector_width);
memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
}
/**
* Get the i-th component of vector \p v.
*/
scalar_type
get(const vector_type &v, unsigned i)
{
assert(i < vector_width);
scalar_type x;
memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
return x;
}
/**
* Add two vectors with saturation.
*/
vector_type
adds(const vector_type &v, const vector_type &w)
{
const vector_type u = {{
_mm_adds_epi16(v.v[0], w.v[0]),
_mm_adds_epi16(v.v[1], w.v[1])
}};
return u;
}
/**
* Subtract two vectors with saturation.
*/
vector_type
subs(const vector_type &v, const vector_type &w)
{
const vector_type u = {{
_mm_subs_epi16(v.v[0], w.v[0]),
_mm_subs_epi16(v.v[1], w.v[1])
}};
return u;
}
/**
* Compute the bitwise conjunction of two vectors.
*/
vector_type
mask(const vector_type &v, const vector_type &w)
{
const vector_type u = {{
_mm_and_si128(v.v[0], w.v[0]),
_mm_and_si128(v.v[1], w.v[1])
}};
return u;
}
/**
* Reduce the components of a vector using saturating addition.
*/
scalar_type
sums(const vector_type &v)
{
const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
/* Swap the 64-bit halves (0x4e) and fold. */
const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
/* Swap adjacent 32-bit words (0xb1) and fold. */
const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
/* Swap adjacent 16-bit lanes of the low quadword and fold, leaving the
 * saturating sum of all lanes in component 0.
 */
const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
return _mm_extract_epi16(v1, 0);
}
}
#else
/**
* Thin layer around vector intrinsics so they can be easily replaced with
* e.g. the fall-back scalar path, an implementation with different vector
* width or using different SIMD architectures (AVX-512?!).
*
* This implementation operates on scalar values and doesn't rely on
* any vector extensions. This is mainly intended for debugging and
* to keep this file building on exotic platforms.
*/
namespace {
/**
* SIMD integer vector data type.
*/
typedef int16_t vector_type;
/**
* Scalar data type matching the representation of a single component of \p
* vector_type.
*/
typedef int16_t scalar_type;
/**
* Maximum integer value representable as a \p scalar_type.
*/
const scalar_type max_scalar = INT16_MAX;
/**
* Number of components of a \p vector_type.
*/
const unsigned vector_width = 1;
/**
* Set the i-th component of vector \p v to \p x.
*/
void
set(vector_type &v, unsigned i, scalar_type x)
{
assert(i < vector_width);
v = x;
}
/**
* Get the i-th component of vector \p v.
*/
scalar_type
get(const vector_type &v, unsigned i)
{
assert(i < vector_width);
return v;
}
/**
* Add two vectors with saturation.
*/
vector_type
adds(vector_type v, vector_type w)
{
return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
}
/**
* Subtract two vectors with saturation.
*/
vector_type
subs(vector_type v, vector_type w)
{
return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
}
/**
* Compute the bitwise conjunction of two vectors.
*/
vector_type
mask(vector_type v, vector_type w)
{
return v & w;
}
/**
* Reduce the components of a vector using saturating addition.
*/
scalar_type
sums(vector_type v)
{
return v;
}
}
#endif
/**
* Swap \p x and \p y.
*/
#define SWAP(x, y) do { \
__typeof(y) _swap_tmp = y; \
y = x; \
x = _swap_tmp; \
} while (0)
namespace {
/**
* Variable-length vector type intended to represent cycle-count costs for
* arbitrary atom-to-bank assignments. It's indexed by a pair of integers
* (i, p), where i is an atom index and p in {0, 1} indicates the parity of
* the conflict (respectively, whether the cost is incurred whenever the
* atoms are assigned the same bank b or opposite-parity banks b and b^1).
* \sa shader_conflict_weight_matrix()
*/
struct weight_vector_type {
weight_vector_type() : v(NULL), size(0) {}
weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}
weight_vector_type(const weight_vector_type &u) :
v(alloc(u.size)), size(u.size)
{
memcpy(v, u.v,
DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
}
~weight_vector_type()
{
free(v);
}
weight_vector_type &
operator=(weight_vector_type u)
{
SWAP(v, u.v);
SWAP(size, u.size);
return *this;
}
vector_type *v;
unsigned size;
private:
static vector_type *
alloc(unsigned n)
{
const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type));
const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
void *p;
if (posix_memalign(&p, align, size))
return NULL;
memset(p, 0, size);
return reinterpret_cast<vector_type *>(p);
}
};
/**
* Set the (i, p)-th component of weight vector \p v to \p x.
*/
void
set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
{
set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
}
/**
* Get the (i, p)-th component of weight vector \p v.
*/
scalar_type
get(const weight_vector_type &v, unsigned i, unsigned p)
{
return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
}
/**
* Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
*/
void
swap(weight_vector_type &v,
unsigned i, unsigned p,
unsigned j, unsigned q)
{
const scalar_type tmp = get(v, i, p);
set(v, i, p, get(v, j, q));
set(v, j, q, tmp);
}
}
namespace {
/**
* Object that represents the partitioning of an arbitrary register space
* into indivisible units (referred to as atoms below) that can potentially
* be rearranged independently from other registers. The partitioning is
* inferred from a number of contiguity requirements specified using
* require_contiguous(). This allows efficient look-up of the atom index a
* given register address belongs to, or conversely the range of register
* addresses that belong to a given atom.
*/
struct partitioning {
/**
* Create a (for the moment unrestricted) partitioning of a register
* file of size \p n. The units are arbitrary.
*/
partitioning(unsigned n) :
max_reg(n),
offsets(new unsigned[n + num_terminator_atoms]),
atoms(new unsigned[n + num_terminator_atoms])
{
for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
offsets[i] = i;
atoms[i] = i;
}
}
partitioning(const partitioning &p) :
max_reg(p.max_reg),
offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
atoms(new unsigned[p.max_reg + num_terminator_atoms])
{
memcpy(offsets, p.offsets,
sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
memcpy(atoms, p.atoms,
sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
}
~partitioning()
{
delete[] offsets;
delete[] atoms;
}
partitioning &
operator=(partitioning p)
{
SWAP(max_reg, p.max_reg);
SWAP(offsets, p.offsets);
SWAP(atoms, p.atoms);
return *this;
}
/**
* Require register range [reg, reg + n[ to be considered part of the
* same atom.
*/
void
require_contiguous(unsigned reg, unsigned n)
{
unsigned r = atoms[reg];
/* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
* case that the specified contiguity requirement leads to the fusion
* (yay) of one or more existing atoms.
*/
for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
if (offsets[atoms[reg1]] < reg + n) {
atoms[reg1] = r;
} else {
if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
r++;
offsets[r] = offsets[atoms[reg1]];
atoms[reg1] = r;
}
}
}
/**
* Get the atom index register address \p reg belongs to.
*/
unsigned
atom_of_reg(unsigned reg) const
{
return atoms[reg];
}
/**
* Get the base register address that belongs to atom \p r.
*/
unsigned
reg_of_atom(unsigned r) const
{
return offsets[r];
}
/**
* Get the size of atom \p r in register address units.
*/
unsigned
size_of_atom(unsigned r) const
{
assert(r < num_atoms());
return reg_of_atom(r + 1) - reg_of_atom(r);
}
/**
* Get the number of atoms the whole register space is partitioned into.
*/
unsigned
num_atoms() const
{
return atoms[max_reg];
}
private:
/**
* Number of trailing atoms inserted for convenience so among other
* things we don't need to special-case the last element in
* size_of_atom().
*/
static const unsigned num_terminator_atoms = 1;
unsigned max_reg;
unsigned *offsets;
unsigned *atoms;
};
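/* Worked example (illustrative): for a partitioning of a 4-register file,
 * require_contiguous(1, 2) fuses registers 1 and 2 into a single atom,
 * leaving three atoms overall ({0}, {1,2} and {3}), so that
 * atom_of_reg(1) == atom_of_reg(2) and size_of_atom(atom_of_reg(1)) == 2.
 */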
/**
* Only GRF sources (whether they have been register-allocated or not) can
* possibly incur bank conflicts.
*/
bool
is_grf(const fs_reg &r)
{
return r.file == VGRF || r.file == FIXED_GRF;
}
/**
* Register offset of \p r in GRF units. Useful because the representation
* of GRFs post-register allocation is somewhat inconsistent and depends on
* whether the register already had a fixed GRF offset prior to register
* allocation or whether it was part of a VGRF allocation.
*/
unsigned
reg_of(const fs_reg &r)
{
assert(is_grf(r));
if (r.file == VGRF)
return r.nr + r.offset / REG_SIZE;
else
return reg_offset(r) / REG_SIZE;
}
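/* E.g. (illustrative): a VGRF with nr == 5 and offset == 40 yields
 * 5 + 40 / REG_SIZE == 6 with the usual 32-byte REG_SIZE.
 */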
/**
* Calculate the finest partitioning of the GRF space compatible with the
* register contiguity requirements derived from all instructions part of
* the program.
*/
partitioning
shader_reg_partitioning(const fs_visitor *v)
{
partitioning p(BRW_MAX_GRF);
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
if (is_grf(inst->dst))
p.require_contiguous(reg_of(inst->dst), regs_written(inst));
for (int i = 0; i < inst->sources; i++) {
if (is_grf(inst->src[i]))
p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
}
}
return p;
}
/**
* Return the set of GRF atoms that should be left untouched at their
* original location to avoid violating hardware or software assumptions.
*/
bool *
shader_reg_constraints(const fs_visitor *v, const partitioning &p)
{
bool *constrained = new bool[p.num_atoms()]();
/* These are read implicitly by some send-message instructions without
* any indication at the IR level. Assume they are unsafe to move
* around.
*/
for (unsigned reg = 0; reg < 2; reg++)
constrained[p.atom_of_reg(reg)] = true;
/* From the Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
* subsection "EUISA Instructions", Send Message (page 990):
*
* "r127 must not be used for return address when there is a src and
* dest overlap in send instruction."
*
* Register allocation ensures that, so don't move r127 around, which
* could break that property.
*/
if (v->devinfo->ver >= 8)
constrained[p.atom_of_reg(127)] = true;
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
/* Assume that anything referenced via fixed GRFs is baked into the
* hardware's fixed-function logic and may be unsafe to move around.
* Also take into account the source GRF restrictions of EOT
* send-message instructions.
*/
if (inst->dst.file == FIXED_GRF)
constrained[p.atom_of_reg(reg_of(inst->dst))] = true;
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == FIXED_GRF ||
(is_grf(inst->src[i]) && inst->eot))
constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
}
/* Preserve the original allocation of VGRFs used by the barycentric
* source of the LINTERP instruction on Gfx6, since pair-aligned
* barycentrics allow the PLN instruction to be used.
*/
if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
inst->opcode == FS_OPCODE_LINTERP)
constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;
/* The location of the Gfx7 MRF hack registers is hard-coded in the
* rest of the compiler back-end. Don't attempt to move them around.
*/
if (v->devinfo->ver >= 7) {
assert(inst->dst.file != MRF);
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
constrained[p.atom_of_reg(reg)] = true;
}
}
}
return constrained;
}
/**
* Return whether the hardware will be able to prevent a bank conflict by
* optimizing out the read cycle of a source register. The formula was
* found experimentally.
*/
bool
is_conflict_optimized_out(const intel_device_info *devinfo,
const fs_inst *inst)
{
return devinfo->ver >= 9 &&
((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
reg_of(inst->src[1]) == reg_of(inst->src[2]));
}
/**
* Return a matrix that allows reasonably efficient computation of the
* cycle-count cost of bank conflicts incurred throughout the whole program
* for any given atom-to-bank assignment.
*
* More precisely, if C_r_s_p is the result of this function, the total
* cost of all bank conflicts involving any given atom r can be readily
* recovered as follows:
*
* S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
*
* where d_i_j is the Kronecker delta, and B_r indicates the bank
* assignment of r. \sa delta_conflicts() for a vectorized implementation
* of the expression above.
*
* FINISHME: Teach this about the Gfx10+ bank conflict rules, which are
* somewhat more relaxed than on previous generations. In the
* meantime optimizing based on Gfx9 weights is likely to be more
* helpful than not optimizing at all.
*/
weight_vector_type *
shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
{
weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
for (unsigned r = 0; r < p.num_atoms(); r++)
conflicts[r] = weight_vector_type(2 * p.num_atoms());
/* Crude approximation of the number of times the current basic block
* will be executed at run-time.
*/
unsigned block_scale = 1;
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
if (inst->opcode == BRW_OPCODE_DO) {
block_scale *= 10;
} else if (inst->opcode == BRW_OPCODE_WHILE) {
block_scale /= 10;
} else if (inst->is_3src(v->compiler) &&
is_grf(inst->src[1]) && is_grf(inst->src[2])) {
const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));
/* Estimate of the cycle-count cost of incurring a bank conflict
* for this instruction. This is only true on the average, for a
* sequence of back-to-back ternary instructions, since the EU
* front-end only seems to be able to issue a new instruction at
* an even cycle. The cost of a bank conflict incurred by an
* isolated ternary instruction may be higher.
*/
const unsigned exec_size = inst->dst.component_size(inst->exec_size);
const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
REG_SIZE);
/* Neglect same-atom conflicts (since they're either trivial or
* impossible to avoid without splitting the atom), and conflicts
* known to be optimized out by the hardware.
*/
if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
/* Calculate the parity of the sources relative to the start of
* their respective atoms. If their parity is the same (and
* none of the atoms straddle the 2KB mark), the instruction
* will incur a conflict iff both atoms are assigned the same
* bank b. If their parity is opposite, the instruction will
* incur a conflict iff they are assigned opposite banks (b and
* b^1).
*/
const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
const unsigned parity = p_r ^ p_s;
/* Calculate the updated cost of a hypothetical conflict
* between atoms r and s. Note that the weight matrix is
* symmetric with respect to indices r and s by construction.
*/
const scalar_type w = MIN2(unsigned(max_scalar),
get(conflicts[r], s, parity) + cycle_scale);
set(conflicts[r], s, parity, w);
set(conflicts[s], r, parity, w);
}
}
}
return conflicts;
}
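/* Worked example (illustrative): if a single back-to-back SIMD8 ternary
 * instruction on a 32-bit type makes atoms r and s conflict with equal
 * source parity, C_r_s_0 becomes 1, so assigning both atoms the same bank
 * (B_r == B_s) makes the Kronecker delta in S(B) fire and adds one cycle
 * to the estimate, while opposite-parity banks contribute nothing.
 */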
/**
* Return the set of GRF atoms that could potentially lead to bank
* conflicts if laid out unfavorably in the GRF space according to
* the specified \p conflicts matrix (\sa
* shader_conflict_weight_matrix()).
*/
bool *
have_any_conflicts(const partitioning &p,
const weight_vector_type *conflicts)
{
bool *any_conflicts = new bool[p.num_atoms()]();
for (unsigned r = 0; r < p.num_atoms(); r++) {
const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
for (unsigned s = 0; s < m; s++)
any_conflicts[r] |= sums(conflicts[r].v[s]);
}
return any_conflicts;
}
/**
* Calculate the difference between two S(B) cost estimates as defined
* above (\sa shader_conflict_weight_matrix()). This represents the
* (partial) cycle-count benefit from moving an atom r from bank p to n.
* The respective bank assignments Bp and Bn are encoded as the \p
* bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
* according to the formula:
*
* bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
*
* Notice the similarity with the delta function in the S(B) expression
* above, and how bank_mask(B) can be precomputed for every possible
* selection of r since bank_mask(B) only depends on it via B_r that may
* only assume one of four different values, so the caller can keep every
* possible bank_mask(B) vector in memory without much hassle (\sa
* bank_characteristics()).
*/
int
delta_conflicts(const weight_vector_type &bank_mask_p,
const weight_vector_type &bank_mask_n,
const weight_vector_type &conflicts)
{
const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
vector_type s_p = {}, s_n = {};
for (unsigned r = 0; r < m; r++) {
s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
}
return sums(subs(s_p, s_n));
}
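/* Note (illustrative): the -1/0 encoding produced by bank_characteristics()
 * below turns the Kronecker delta into the bitwise AND above, so s_p and
 * s_n accumulate exactly the conflict weights that fire under the old and
 * new bank assignments, and their difference is the cycle-count benefit
 * of the move.
 */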
/**
* Register atom permutation, represented as the start GRF offset each atom
* is mapped into.
*/
struct permutation {
permutation() : v(NULL), size(0) {}
permutation(unsigned n) :
v(new unsigned[n]()), size(n) {}
permutation(const permutation &p) :
v(new unsigned[p.size]), size(p.size)
{
memcpy(v, p.v, p.size * sizeof(unsigned));
}
~permutation()
{
delete[] v;
}
permutation &
operator=(permutation p)
{
SWAP(v, p.v);
SWAP(size, p.size);
return *this;
}
unsigned *v;
unsigned size;
};
/**
* Return an identity permutation of GRF atoms.
*/
permutation
identity_reg_permutation(const partitioning &p)
{
permutation map(p.num_atoms());
for (unsigned r = 0; r < map.size; r++)
map.v[r] = p.reg_of_atom(r);
return map;
}
/**
* Return the bank index of GRF address \p reg, numbered according to the
* table:
* Even Odd
* Lo 0 1
* Hi 2 3
*/
unsigned
bank_of(unsigned reg)
{
return (reg & 0x40) >> 5 | (reg & 1);
}
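/* E.g. (illustrative): bank_of(0) == 0, bank_of(1) == 1, bank_of(64) == 2
 * and bank_of(65) == 3, since bit 0 selects the Even/Odd column and bit 6
 * (the 2KB boundary) selects the Lo/Hi row of the table above.
 */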
/**
* Return bitmasks suitable for use as bank mask arguments for the
* delta_conflicts() computation. Note that this is just the (negative)
* characteristic function of each bank, if you regard it as a set
* containing all atoms assigned to it according to the \p map array.
*/
weight_vector_type *
bank_characteristics(const permutation &map)
{
weight_vector_type *banks = new weight_vector_type[4];
for (unsigned b = 0; b < 4; b++) {
banks[b] = weight_vector_type(2 * map.size);
for (unsigned j = 0; j < map.size; j++) {
for (unsigned p = 0; p < 2; p++)
set(banks[b], j, p,
(b ^ p) == bank_of(map.v[j]) ? -1 : 0);
}
}
return banks;
}
/**
* Return an improved permutation of GRF atoms based on \p map attempting
* to reduce the total cycle-count cost of bank conflicts greedily.
*
* Note that this doesn't attempt to merge multiple atoms into one, which
* might do a better job in some cases; it simply reorders existing atoms
* in the GRF space without affecting their identity.
*/
permutation
optimize_reg_permutation(const partitioning &p,
const bool *constrained,
const weight_vector_type *conflicts,
permutation map)
{
const bool *any_conflicts = have_any_conflicts(p, conflicts);
weight_vector_type *banks = bank_characteristics(map);
for (unsigned r = 0; r < map.size; r++) {
const unsigned bank_r = bank_of(map.v[r]);
if (!constrained[r]) {
unsigned best_s = r;
int best_benefit = 0;
for (unsigned s = 0; s < map.size; s++) {
const unsigned bank_s = bank_of(map.v[s]);
if (bank_r != bank_s && !constrained[s] &&
p.size_of_atom(r) == p.size_of_atom(s) &&
(any_conflicts[r] || any_conflicts[s])) {
const int benefit =
delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);
if (benefit > best_benefit) {
best_s = s;
best_benefit = benefit;
}
}
}
if (best_s != r) {
for (unsigned b = 0; b < 4; b++) {
for (unsigned p = 0; p < 2; p++)
swap(banks[b], r, p, best_s, p);
}
SWAP(map.v[r], map.v[best_s]);
}
}
}
delete[] banks;
delete[] any_conflicts;
return map;
}
/**
* Apply the GRF atom permutation given by \p map to register \p r and
* return the result.
*/
fs_reg
transform(const partitioning &p, const permutation &map, fs_reg r)
{
if (r.file == VGRF) {
const unsigned reg = reg_of(r);
const unsigned s = p.atom_of_reg(reg);
r.nr = map.v[s] + reg - p.reg_of_atom(s);
r.offset = r.offset % REG_SIZE;
}
return r;
}
}
bool
fs_visitor::opt_bank_conflicts()
{
assert(grf_used || !"Must be called after register allocation");
/* TODO: Re-work this pass for Gfx20+. */
if (devinfo->ver >= 20)
return false;
/* No ternary instructions -- No bank conflicts. */
if (devinfo->ver < 6)
return false;
const partitioning p = shader_reg_partitioning(this);
const bool *constrained = shader_reg_constraints(this, p);
const weight_vector_type *conflicts =
shader_conflict_weight_matrix(this, p);
const permutation map =
optimize_reg_permutation(p, constrained, conflicts,
identity_reg_permutation(p));
foreach_block_and_inst(block, fs_inst, inst, cfg) {
inst->dst = transform(p, map, inst->dst);
for (int i = 0; i < inst->sources; i++)
inst->src[i] = transform(p, map, inst->src[i]);
}
delete[] conflicts;
delete[] constrained;
return true;
}
/**
* Return whether the instruction incurs GRF bank conflict cycles.
*
* Note that this is only accurate after register allocation because otherwise
* we don't know which bank each VGRF is going to end up aligned to.
*/
bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst)
{
return is_3src(isa, inst->opcode) &&
is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
!is_conflict_optimized_out(isa->devinfo, inst);
}

View file

@ -0,0 +1,965 @@
/* -*- c++ -*- */
/*
* Copyright © 2010-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H
#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_eu.h"
#include "brw_fs.h"
namespace brw {
/**
* Toolbox to assemble an FS IR program out of individual instructions.
*
* This object is meant to have an interface consistent with
* brw::vec4_builder. They cannot be fully interchangeable because
* brw::fs_builder generates scalar code while brw::vec4_builder generates
* vector code.
*/
class fs_builder {
public:
/** Type used in this IR to represent a source of an instruction. */
typedef fs_reg src_reg;
/** Type used in this IR to represent the destination of an instruction. */
typedef fs_reg dst_reg;
/** Type used in this IR to represent an instruction. */
typedef fs_inst instruction;
/**
* Construct an fs_builder that inserts instructions into \p shader.
* \p dispatch_width gives the native execution width of the program.
*/
fs_builder(fs_visitor *shader,
unsigned dispatch_width) :
shader(shader), block(NULL), cursor(NULL),
_dispatch_width(dispatch_width),
_group(0),
force_writemask_all(false),
annotation()
{
}
explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {}
/**
* Construct an fs_builder that inserts instructions into \p shader
* before instruction \p inst in basic block \p block. The default
* execution controls and debug annotation are initialized from the
* instruction passed as argument.
*/
fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
shader(shader), block(block), cursor(inst),
_dispatch_width(inst->exec_size),
_group(inst->group),
force_writemask_all(inst->force_writemask_all)
{
annotation.str = inst->annotation;
annotation.ir = inst->ir;
}
/**
* Construct an fs_builder that inserts instructions before \p cursor in
* basic block \p block, inheriting other code generation parameters
* from this.
*/
fs_builder
at(bblock_t *block, exec_node *cursor) const
{
fs_builder bld = *this;
bld.block = block;
bld.cursor = cursor;
return bld;
}
/**
* Construct an fs_builder appending instructions at the end of the
* instruction list of the shader, inheriting other code generation
* parameters from this.
*/
fs_builder
at_end() const
{
return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
}
/**
* Construct a builder specifying the default SIMD width and group of
* channel enable signals, inheriting other code generation parameters
* from this.
*
* \p n gives the default SIMD width, \p i gives the slot group used for
* predication and control flow masking in multiples of \p n channels.
*/
fs_builder
group(unsigned n, unsigned i) const
{
fs_builder bld = *this;
if (n <= dispatch_width() && i < dispatch_width() / n) {
bld._group += i * n;
} else {
/* The requested channel group isn't a subset of the channel group
* of this builder, which means that the resulting instructions
* would use (potentially undefined) channel enable signals not
* specified by the parent builder. That's only valid if the
* instruction doesn't have per-channel semantics, in which case
* we should clear off the default group index in order to prevent
* emitting instructions with a channel group not aligned to their
* own execution size.
*/
assert(force_writemask_all);
bld._group = 0;
}
bld._dispatch_width = n;
return bld;
}
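/* Usage sketch (illustrative): with a SIMD16 builder bld,
 *
 *    bld.group(8, 1).MOV(dst, src);
 *
 * emits a SIMD8 MOV whose channel enables cover slots 8..15 of the
 * parent group, since _group advances by i * n == 8.
 */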
/**
* Alias for group() with width equal to eight.
*/
fs_builder
quarter(unsigned i) const
{
return group(8, i);
}
/**
* Construct a builder with per-channel control flow execution masking
* disabled if \p b is true. If control flow execution masking is
* already disabled this has no effect.
*/
fs_builder
exec_all(bool b = true) const
{
fs_builder bld = *this;
if (b)
bld.force_writemask_all = true;
return bld;
}
/**
* Construct a builder with the given debug annotation info.
*/
fs_builder
annotate(const char *str, const void *ir = NULL) const
{
fs_builder bld = *this;
bld.annotation.str = str;
bld.annotation.ir = ir;
return bld;
}
/**
* Get the SIMD width in use.
*/
unsigned
dispatch_width() const
{
return _dispatch_width;
}
/**
* Get the channel group in use.
*/
unsigned
group() const
{
return _group;
}
/**
* Allocate a virtual register of natural vector size (one for this IR)
* and SIMD width. \p n gives the amount of space to allocate in
* dispatch_width units (which is just enough space for one logical
* component in this IR).
*/
dst_reg
vgrf(enum brw_reg_type type, unsigned n = 1) const
{
const unsigned unit = reg_unit(shader->devinfo);
assert(dispatch_width() <= 32);
if (n > 0)
return dst_reg(VGRF, shader->alloc.allocate(
DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
unit * REG_SIZE) * unit),
type);
else
return retype(null_reg_ud(), type);
}
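/* Sizing sketch (illustrative): for a SIMD16 builder and a 32-bit type,
 * vgrf(BRW_REGISTER_TYPE_F) asks for 4 * 16 == 64 bytes, i.e. two
 * 32-byte GRFs on hardware where reg_unit() == 1.
 */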
/**
* Create a null register of floating type.
*/
dst_reg
null_reg_f() const
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
}
dst_reg
null_reg_df() const
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
}
/**
* Create a null register of signed integer type.
*/
dst_reg
null_reg_d() const
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
}
/**
* Create a null register of unsigned integer type.
*/
dst_reg
null_reg_ud() const
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
}
/**
* Insert an instruction into the program.
*/
instruction *
emit(const instruction &inst) const
{
return emit(new(shader->mem_ctx) instruction(inst));
}
/**
* Create and insert a nullary control instruction into the program.
*/
instruction *
emit(enum opcode opcode) const
{
return emit(instruction(opcode, dispatch_width()));
}
/**
* Create and insert a nullary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst) const
{
return emit(instruction(opcode, dispatch_width(), dst));
}
/**
* Create and insert a unary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
{
switch (opcode) {
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_RSQ:
case SHADER_OPCODE_SQRT:
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS:
return emit(instruction(opcode, dispatch_width(), dst,
fix_math_operand(src0)));
default:
return emit(instruction(opcode, dispatch_width(), dst, src0));
}
}
/**
* Create and insert a binary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
const src_reg &src1) const
{
switch (opcode) {
case SHADER_OPCODE_POW:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
return emit(instruction(opcode, dispatch_width(), dst,
fix_math_operand(src0),
fix_math_operand(src1)));
default:
return emit(instruction(opcode, dispatch_width(), dst,
src0, src1));
}
}
/**
* Create and insert a ternary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
const src_reg &src1, const src_reg &src2) const
{
switch (opcode) {
case BRW_OPCODE_BFE:
case BRW_OPCODE_BFI2:
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
return emit(instruction(opcode, dispatch_width(), dst,
fix_3src_operand(src0),
fix_3src_operand(src1),
fix_3src_operand(src2)));
default:
return emit(instruction(opcode, dispatch_width(), dst,
src0, src1, src2));
}
}
/**
* Create and insert an instruction with a variable number of sources
* into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
unsigned n) const
{
/* Use the emit() methods for specific operand counts to ensure that
* opcode-specific operand fixups occur.
*/
if (n == 2) {
return emit(opcode, dst, srcs[0], srcs[1]);
} else if (n == 3) {
return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
} else {
return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
}
}
/**
* Insert a preallocated instruction into the program.
*/
instruction *
emit(instruction *inst) const
{
assert(inst->exec_size <= 32);
assert(inst->exec_size == dispatch_width() ||
force_writemask_all);
inst->group = _group;
inst->force_writemask_all = force_writemask_all;
inst->annotation = annotation.str;
inst->ir = annotation.ir;
if (block)
static_cast<instruction *>(cursor)->insert_before(block, inst);
else
cursor->insert_before(inst);
return inst;
}
/**
* Select \p src0 if the comparison of both sources with the given
* conditional mod evaluates to true, otherwise select \p src1.
*
* Generally useful to get the minimum or maximum of two values.
*/
instruction *
emit_minmax(const dst_reg &dst, const src_reg &src0,
const src_reg &src1, brw_conditional_mod mod) const
{
assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
/* In some cases we can't have bytes as an operand for src1, so use the
* same type for both operands.
*/
return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* Copy any live channel from \p src to the first channel of the result.
*/
src_reg
emit_uniformize(const src_reg &src) const
{
/* FIXME: We use a vector chan_index and dst to allow constant and
* copy propagation to move the result all the way into the consuming
* instruction (typically a surface index or sampler index for a
* send). This uses 1 or 3 extra hw registers in 16 or 32 wide
* dispatch. Once we teach const/copy propagation about scalars we
* should go back to scalar destinations here.
*/
const fs_builder ubld = exec_all();
const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
const dst_reg dst = vgrf(src.type);
ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
return src_reg(component(dst, 0));
}
src_reg
move_to_vgrf(const src_reg &src, unsigned num_components) const
{
src_reg *const src_comps = new src_reg[num_components];
for (unsigned i = 0; i < num_components; i++)
src_comps[i] = offset(src, dispatch_width(), i);
const dst_reg dst = vgrf(src.type, num_components);
LOAD_PAYLOAD(dst, src_comps, num_components, 0);
delete[] src_comps;
return src_reg(dst);
}
void
emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
const dst_reg &tmp,
unsigned left_offset, unsigned left_stride,
unsigned right_offset, unsigned right_stride) const
{
dst_reg left, right;
left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
if ((tmp.type == BRW_REGISTER_TYPE_Q ||
tmp.type == BRW_REGISTER_TYPE_UQ) &&
!shader->devinfo->has_64bit_int) {
switch (opcode) {
case BRW_OPCODE_MUL:
/* This will get lowered by integer MUL lowering */
set_condmod(mod, emit(opcode, right, left, right));
break;
case BRW_OPCODE_SEL: {
/* In order for the comparisons to work out right, we need our
* comparisons to be strict.
*/
assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
if (mod == BRW_CONDITIONAL_GE)
mod = BRW_CONDITIONAL_G;
/* We treat the bottom 32 bits as unsigned regardless of
* whether or not the integer as a whole is signed.
*/
dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);
/* The upper bits get the same sign as the 64-bit type */
brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
dst_reg right_high = subscript(right, type32, 1);
dst_reg left_high = subscript(left, type32, 1);
/* Build up our comparison:
*
* l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
*/
CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
retype(right_low, BRW_REGISTER_TYPE_UD), mod);
set_predicate(BRW_PREDICATE_NORMAL,
CMP(null_reg_ud(), left_high, right_high,
BRW_CONDITIONAL_EQ));
set_predicate_inv(BRW_PREDICATE_NORMAL, true,
CMP(null_reg_ud(), left_high, right_high, mod));
/* We could use selects here or we could use predicated MOVs
* because the destination and second source (if it were a SEL)
* are the same.
*/
set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
break;
}
default:
unreachable("Unsupported 64-bit scan op");
}
} else {
set_condmod(mod, emit(opcode, right, left, right));
}
}
void
emit_scan(enum opcode opcode, const dst_reg &tmp,
unsigned cluster_size, brw_conditional_mod mod) const
{
assert(dispatch_width() >= 8);
/* The instruction splitting code isn't advanced enough to split
* these so we need to handle that ourselves.
*/
if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
const unsigned half_width = dispatch_width() / 2;
const fs_builder ubld = exec_all().group(half_width, 0);
dst_reg left = tmp;
dst_reg right = horiz_offset(tmp, half_width);
ubld.emit_scan(opcode, left, cluster_size, mod);
ubld.emit_scan(opcode, right, cluster_size, mod);
if (cluster_size > half_width) {
ubld.emit_scan_step(opcode, mod, tmp,
half_width - 1, 0, half_width, 1);
}
return;
}
if (cluster_size > 1) {
const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
}
if (cluster_size > 2) {
if (type_sz(tmp.type) <= 4) {
const fs_builder ubld =
exec_all().group(dispatch_width() / 4, 0);
ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
} else {
/* For 64-bit types, we have to do things differently because
* the code above would land us with destination strides that
* the hardware can't handle. Fortunately, we'll only be
* 8-wide in that case and it's the same number of
* instructions.
*/
const fs_builder ubld = exec_all().group(2, 0);
for (unsigned i = 0; i < dispatch_width(); i += 4)
ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
}
}
for (unsigned i = 4;
i < MIN2(cluster_size, dispatch_width());
i *= 2) {
const fs_builder ubld = exec_all().group(i, 0);
ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);
if (dispatch_width() > i * 2)
ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);
if (dispatch_width() > i * 4) {
ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
}
}
}
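/* Worked example (illustrative): for a SIMD8 builder, a 32-bit type and
 * cluster_size == 4, the steps above first fold tmp[2k] into tmp[2k+1],
 * then fan tmp[1] and tmp[5] out into elements 2-3 and 6-7 respectively,
 * leaving an inclusive scan within each 4-channel cluster.
 */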
instruction *
emit_undef_for_dst(const instruction *old_inst) const
{
assert(old_inst->dst.file == VGRF);
instruction *inst = emit(SHADER_OPCODE_UNDEF,
retype(old_inst->dst, BRW_REGISTER_TYPE_UD));
inst->size_written = old_inst->size_written;
return inst;
}
/**
* Assorted arithmetic ops.
* @{
*/
#define ALU1(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0) const \
{ \
return emit(BRW_OPCODE_##op, dst, src0); \
}
#define ALU2(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
{ \
return emit(BRW_OPCODE_##op, dst, src0, src1); \
}
#define ALU2_ACC(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
{ \
instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
inst->writes_accumulator = true; \
return inst; \
}
#define ALU3(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
const src_reg &src2) const \
{ \
return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
}
ALU2(ADD)
ALU3(ADD3)
ALU2_ACC(ADDC)
ALU2(AND)
ALU2(ASR)
ALU2(AVG)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(BFREV)
ALU1(CBIT)
ALU1(DIM)
ALU2(DP2)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU1(FBH)
ALU1(FBL)
ALU1(FRC)
ALU3(DP4A)
ALU2(LINE)
ALU1(LZD)
ALU2(MAC)
ALU2_ACC(MACH)
ALU3(MAD)
ALU1(MOV)
ALU2(MUL)
ALU1(NOT)
ALU2(OR)
ALU2(PLN)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(ROL)
ALU2(ROR)
ALU2(SAD2)
ALU2_ACC(SADA2)
ALU2(SEL)
ALU2(SHL)
ALU2(SHR)
ALU2_ACC(SUBB)
ALU2(XOR)
#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
instruction *
F32TO16(const dst_reg &dst, const src_reg &src) const
{
assert(dst.type == BRW_REGISTER_TYPE_HF);
assert(src.type == BRW_REGISTER_TYPE_F);
if (shader->devinfo->ver >= 8) {
return MOV(dst, src);
} else {
assert(shader->devinfo->ver == 7);
return emit(BRW_OPCODE_F32TO16,
retype(dst, BRW_REGISTER_TYPE_W), src);
}
}
instruction *
F16TO32(const dst_reg &dst, const src_reg &src) const
{
assert(dst.type == BRW_REGISTER_TYPE_F);
assert(src.type == BRW_REGISTER_TYPE_HF);
if (shader->devinfo->ver >= 8) {
return MOV(dst, src);
} else {
assert(shader->devinfo->ver == 7);
return emit(BRW_OPCODE_F16TO32,
dst, retype(src, BRW_REGISTER_TYPE_W));
}
}
/** @} */
/**
* CMP: Sets the low bit of the destination channels with the result
* of the comparison, while the upper bits are undefined, and updates
* the flag register with the packed 16 bits of the result.
*/
instruction *
CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
brw_conditional_mod condition) const
{
/* Take the instruction:
*
* CMP null<d> src0<f> src1<f>
*
* Original gfx4 does type conversion to the destination type
* before comparison, producing garbage results for floating
* point comparisons.
*
* The destination type doesn't matter on newer generations,
* so we set the type to match src0 so we can compact the
* instruction.
*/
return set_condmod(condition,
emit(BRW_OPCODE_CMP, retype(dst, src0.type),
fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* CMPN: Behaves like CMP, but produces true if src1 is NaN.
*/
instruction *
CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
brw_conditional_mod condition) const
{
/* Take the instruction:
*
* CMP null<d> src0<f> src1<f>
*
* Original gfx4 does type conversion to the destination type
* before comparison, producing garbage results for floating
* point comparisons.
*
* The destination type doesn't matter on newer generations,
* so we set the type to match src0 so we can compact the
* instruction.
*/
return set_condmod(condition,
emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* Gfx4 predicated IF.
*/
instruction *
IF(brw_predicate predicate) const
{
return set_predicate(predicate, emit(BRW_OPCODE_IF));
}
/**
* CSEL: dst = src2 <op> 0.0f ? src0 : src1
*/
instruction *
CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
const src_reg &src2, brw_conditional_mod condition) const
{
/* CSEL only operates on floats, so we can't do integer </<=/>=/>
* comparisons. Zero/non-zero (== and !=) comparisons almost work.
* 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
*/
assert(src2.type == BRW_REGISTER_TYPE_F);
return set_condmod(condition,
emit(BRW_OPCODE_CSEL,
retype(dst, BRW_REGISTER_TYPE_F),
retype(src0, BRW_REGISTER_TYPE_F),
retype(src1, BRW_REGISTER_TYPE_F),
src2));
}
/**
* Emit a linear interpolation instruction.
*/
instruction *
LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
const src_reg &a) const
{
if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
/* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
* we need to reorder the operands.
*/
return emit(BRW_OPCODE_LRP, dst, a, y, x);
} else {
/* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
const dst_reg y_times_a = vgrf(dst.type);
const dst_reg one_minus_a = vgrf(dst.type);
const dst_reg x_times_one_minus_a = vgrf(dst.type);
MUL(y_times_a, y, a);
ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
}
}
/**
* Collect a number of registers into a contiguous range.
*/
instruction *
LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
unsigned sources, unsigned header_size) const
{
instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
inst->header_size = header_size;
inst->size_written = header_size * REG_SIZE;
for (unsigned i = header_size; i < sources; i++) {
inst->size_written += dispatch_width() * type_sz(src[i].type) *
dst.stride;
}
return inst;
}
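/* A worked example (hypothetical numbers) of the size_written
 * computation in LOAD_PAYLOAD above: in SIMD8 with header_size == 1 and
 * two float payload sources of stride 1, size_written =
 * 1 * REG_SIZE + 2 * (8 * 4 * 1) = 32 + 64 = 96 bytes.
 */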
instruction *
UNDEF(const dst_reg &dst) const
{
assert(dst.file == VGRF);
assert(dst.offset % REG_SIZE == 0);
instruction *inst = emit(SHADER_OPCODE_UNDEF,
retype(dst, BRW_REGISTER_TYPE_UD));
inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;
return inst;
}
instruction *
DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2,
unsigned sdepth, unsigned rcount) const
{
assert(_dispatch_width == 8);
assert(sdepth == 8);
assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);
instruction *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
inst->sdepth = sdepth;
inst->rcount = rcount;
if (dst.type == BRW_REGISTER_TYPE_HF) {
inst->size_written = rcount * REG_SIZE / 2;
} else {
inst->size_written = rcount * REG_SIZE;
}
return inst;
}
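/* An illustrative reading of the DPAS size_written branch above: with
 * rcount == 8, a float destination writes 8 full registers, while an HF
 * destination packs the same rows into half as many bytes.
 */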
fs_visitor *shader;
fs_inst *BREAK() { return emit(BRW_OPCODE_BREAK); }
fs_inst *DO() { return emit(BRW_OPCODE_DO); }
fs_inst *ENDIF() { return emit(BRW_OPCODE_ENDIF); }
fs_inst *NOP() { return emit(BRW_OPCODE_NOP); }
fs_inst *WHILE() { return emit(BRW_OPCODE_WHILE); }
fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }
private:
/**
* Workaround for negation of UD registers. See comment in
* fs_generator::generate_code() for more details.
*/
src_reg
fix_unsigned_negate(const src_reg &src) const
{
if (src.type == BRW_REGISTER_TYPE_UD &&
src.negate) {
dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
MOV(temp, src);
return src_reg(temp);
} else {
return src;
}
}
/**
* Workaround for source register modes not supported by the ternary
* instruction encoding.
*/
src_reg
fix_3src_operand(const src_reg &src) const
{
switch (src.file) {
case FIXED_GRF:
/* FINISHME: Could handle scalar region, other stride=1 regions */
if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
src.width != BRW_WIDTH_8 ||
src.hstride != BRW_HORIZONTAL_STRIDE_1)
break;
FALLTHROUGH;
case ATTR:
case VGRF:
case UNIFORM:
case IMM:
return src;
default:
break;
}
dst_reg expanded = vgrf(src.type);
MOV(expanded, src);
return expanded;
}
/**
* Workaround for source register modes not supported by the math
* instruction.
*/
src_reg
fix_math_operand(const src_reg &src) const
{
/* Can't do hstride == 0 args on gfx6 math, so expand it out. We
* might be able to do better by doing execsize = 1 math and then
* expanding that result out, but we would need to be careful with
* masking.
*
* Gfx6 hardware ignores source modifiers (negate and abs) on math
* instructions, so we also move to a temp to set those up.
*
* Gfx7 relaxes most of the above restrictions, but still can't use IMM
* operands to math instructions.
*/
if ((shader->devinfo->ver == 6 &&
(src.file == IMM || src.file == UNIFORM ||
src.abs || src.negate)) ||
(shader->devinfo->ver == 7 && src.file == IMM)) {
const dst_reg tmp = vgrf(src.type);
MOV(tmp, src);
return tmp;
} else {
return src;
}
}
bblock_t *block;
exec_node *cursor;
unsigned _dispatch_width;
unsigned _group;
bool force_writemask_all;
/** Debug annotation info. */
struct {
const char *str;
const void *ir;
} annotation;
};
}
static inline fs_reg
offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
{
return offset(reg, bld.dispatch_width(), delta);
}
#endif


@ -0,0 +1,568 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_eu.h"
/** @file brw_fs_cmod_propagation.cpp
*
* Implements a pass that propagates the conditional modifier from a CMP x 0.0
* instruction into the instruction that generated x. For instance, in this
* sequence
*
* add(8) g70<1>F g69<8,8,1>F 4096F
* cmp.ge.f0(8) null g70<8,8,1>F 0F
*
* we can do the comparison as part of the ADD instruction directly:
*
* add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
*
* If there had been a use of the flag register and another CMP using g70
*
* add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
* (+f0) sel(8) g71<1>F g72<8,8,1>F g73<8,8,1>F
* cmp.ge.f0(8) null g70<8,8,1>F 0F
*
* we can recognize that the CMP is generating the flag value that already
* exists and therefore remove the instruction.
*/
using namespace brw;
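/* An illustrative sketch of the CMP-to-ADD case handled below (register
 * numbers are hypothetical): since CMP computes src0 - src1, the sequence
 *
 *    add(8)       g10<1>F  g2<8,8,1>F  -g3<8,8,1>F
 *    cmp.l.f0(8)  null     g2<8,8,1>F   g3<8,8,1>F
 *
 * computes the same value twice, so the conditional modifier can be
 * folded into the ADD:
 *
 *    add.l.f0(8)  g10<1>F  g2<8,8,1>F  -g3<8,8,1>F
 */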
static bool
cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
fs_inst *inst)
{
bool read_flag = false;
const unsigned flags_written = inst->flags_written(devinfo);
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (scan_inst->opcode == BRW_OPCODE_ADD &&
!scan_inst->is_partial_write() &&
scan_inst->exec_size == inst->exec_size) {
bool negate;
/* A CMP is basically a subtraction. The result of the
* subtraction must be the same as the result of the addition.
* This means that one of the operands must be negated: we match
* (a + b) against cmp(a, -b), or (a + -b) against cmp(a, b).
*/
if ((inst->src[0].equals(scan_inst->src[0]) &&
inst->src[1].negative_equals(scan_inst->src[1])) ||
(inst->src[0].equals(scan_inst->src[1]) &&
inst->src[1].negative_equals(scan_inst->src[0]))) {
negate = false;
} else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
inst->src[1].equals(scan_inst->src[1])) ||
(inst->src[0].negative_equals(scan_inst->src[1]) &&
inst->src[1].equals(scan_inst->src[0]))) {
negate = true;
} else {
goto not_match;
}
/* If the scan instruction writes a different flag register than the
* instruction we're trying to propagate from, bail.
*
* FINISHME: The second part of the condition may be too strong.
* Perhaps (scan_inst->flags_written() & flags_written) !=
* flags_written?
*/
if (scan_inst->flags_written(devinfo) != 0 &&
scan_inst->flags_written(devinfo) != flags_written)
goto not_match;
/* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
*
* * Note that the [post condition signal] bits generated at
* the output of a compute are before the .sat.
*
* Paragraph about post_zero does not mention saturation, but
* testing it on actual GPUs shows that conditional modifiers
* are applied after saturation.
*
* * post_zero bit: This bit reflects whether the final
* result is zero after all the clamping, normalizing,
* or format conversion logic.
*
* For signed types we don't care about saturation: it won't
* change the result of conditional modifier.
*
* For floating-point and unsigned types there are two special cases,
* when we can remove inst even if scan_inst is saturated: G
* and LE. Since conditional modifiers are just comparisons
* against zero, saturating positive values to the upper
* limit never changes the result of comparison.
*
* For negative values:
* (sat(x) > 0) == (x > 0) --- false
* (sat(x) <= 0) == (x <= 0) --- true
*/
const enum brw_conditional_mod cond =
negate ? brw_swap_cmod(inst->conditional_mod)
: inst->conditional_mod;
if (scan_inst->saturate &&
(brw_reg_type_is_floating_point(scan_inst->dst.type) ||
brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) &&
(cond != BRW_CONDITIONAL_G &&
cond != BRW_CONDITIONAL_LE))
goto not_match;
/* Otherwise, try propagating the conditional. */
if (scan_inst->can_do_cmod() &&
((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
scan_inst->conditional_mod == cond)) {
scan_inst->conditional_mod = cond;
scan_inst->flag_subreg = inst->flag_subreg;
inst->remove(block, true);
return true;
}
break;
}
not_match:
if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
break;
read_flag = read_flag ||
(scan_inst->flags_read(devinfo) & flags_written) != 0;
}
return false;
}
/**
* Propagate conditional modifiers from NOT instructions
*
* Attempt to convert sequences like
*
* or(8) g78<8,8,1> g76<8,8,1>UD g77<8,8,1>UD
* ...
* not.nz.f0(8) null g78<8,8,1>UD
*
* into
*
* or.z.f0(8) g78<8,8,1> g76<8,8,1>UD g77<8,8,1>UD
*/
static bool
cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
fs_inst *inst)
{
const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
bool read_flag = false;
const unsigned flags_written = inst->flags_written(devinfo);
if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
return false;
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
if (scan_inst->opcode != BRW_OPCODE_OR &&
scan_inst->opcode != BRW_OPCODE_AND)
break;
if (scan_inst->is_partial_write() ||
scan_inst->dst.offset != inst->src[0].offset ||
scan_inst->exec_size != inst->exec_size)
break;
/* If the scan instruction writes a different flag register than the
* instruction we're trying to propagate from, bail.
*
* FINISHME: The second part of the condition may be too strong.
* Perhaps (scan_inst->flags_written() & flags_written) !=
* flags_written?
*/
if (scan_inst->flags_written(devinfo) != 0 &&
scan_inst->flags_written(devinfo) != flags_written)
break;
if (scan_inst->can_do_cmod() &&
((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
scan_inst->conditional_mod == cond)) {
scan_inst->conditional_mod = cond;
scan_inst->flag_subreg = inst->flag_subreg;
inst->remove(block, true);
return true;
}
break;
}
if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
break;
read_flag = read_flag ||
(scan_inst->flags_read(devinfo) & flags_written) != 0;
}
return false;
}
static bool
opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
{
bool progress = false;
UNUSED int ip = block->end_ip + 1;
foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
ip--;
if ((inst->opcode != BRW_OPCODE_AND &&
inst->opcode != BRW_OPCODE_CMP &&
inst->opcode != BRW_OPCODE_MOV &&
inst->opcode != BRW_OPCODE_NOT) ||
inst->predicate != BRW_PREDICATE_NONE ||
!inst->dst.is_null() ||
(inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
inst->src[0].file != UNIFORM))
continue;
/* An ABS source modifier can only be handled when processing a compare
* with a value other than zero.
*/
if (inst->src[0].abs &&
(inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
continue;
/* Only an AND.NZ can be propagated. Many AND.Z instructions are
* generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
* Propagating those would require inverting the condition on the CMP.
* This changes both the flag value and the register destination of the
* CMP. That result may be used elsewhere, so we can't change its value
* on a whim.
*/
if (inst->opcode == BRW_OPCODE_AND &&
!(inst->src[1].is_one() &&
inst->conditional_mod == BRW_CONDITIONAL_NZ &&
!inst->src[0].negate))
continue;
/* A CMP with a second source of zero can match with anything. A CMP
* with a second source that is not zero can only match with an ADD
* instruction.
*
* Only apply this optimization to floating-point sources. It can fail for
* integers. For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
* int(0x80000000) - 4 overflows and results in 0x7ffffffc. That's not
* less than zero, so the flags get set differently than for (a < b).
*/
if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
if (brw_reg_type_is_floating_point(inst->src[0].type) &&
cmod_propagate_cmp_to_add(devinfo, block, inst))
progress = true;
continue;
}
if (inst->opcode == BRW_OPCODE_NOT) {
progress = cmod_propagate_not(devinfo, block, inst) || progress;
continue;
}
bool read_flag = false;
const unsigned flags_written = inst->flags_written(devinfo);
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
/* If the scan instruction writes a different flag register than
* the instruction we're trying to propagate from, bail.
*
* FINISHME: The second part of the condition may be too strong.
* Perhaps (scan_inst->flags_written() & flags_written) !=
* flags_written?
*/
if (scan_inst->flags_written(devinfo) != 0 &&
scan_inst->flags_written(devinfo) != flags_written)
break;
if (scan_inst->is_partial_write() ||
scan_inst->dst.offset != inst->src[0].offset ||
scan_inst->exec_size != inst->exec_size)
break;
/* If the write mask is different we can't propagate. */
if (scan_inst->force_writemask_all != inst->force_writemask_all)
break;
/* CMP's result is the same regardless of dest type. */
if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
scan_inst->opcode == BRW_OPCODE_CMP &&
brw_reg_type_is_integer(inst->dst.type)) {
inst->remove(block, true);
progress = true;
break;
}
/* If the AND wasn't handled by the previous case, it isn't safe
* to remove it.
*/
if (inst->opcode == BRW_OPCODE_AND)
break;
if (inst->opcode == BRW_OPCODE_MOV) {
if (brw_reg_type_is_floating_point(scan_inst->dst.type)) {
/* If the destination type of scan_inst is floating-point,
* then:
*
* - The source of the MOV instruction must be the same
* type.
*
* - The destination of the MOV instruction must be float
* point with a size at least as large as the destination
* of inst. Size-reducing f2f conversions could cause
* non-zero values to become zero, etc.
*/
if (scan_inst->dst.type != inst->src[0].type)
break;
if (!brw_reg_type_is_floating_point(inst->dst.type))
break;
if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type))
break;
} else {
/* If the destination type of scan_inst is integer, then:
*
* - The source of the MOV instruction must be integer with
* the same size.
*
* - If the conditional modifier is Z or NZ, then the
* destination type of inst must either be floating point
* (of any size) or integer with a size at least as large
* as the destination of inst.
*
* - If the conditional modifier is neither Z nor NZ, then the
* destination type of inst must either be floating point
* (of any size) or integer with a size at least as large
* as the destination of inst and the same signedness.
*/
if (!brw_reg_type_is_integer(inst->src[0].type) ||
type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type))
break;
if (brw_reg_type_is_integer(inst->dst.type)) {
if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type))
break;
if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
inst->conditional_mod != BRW_CONDITIONAL_NZ &&
brw_reg_type_is_unsigned_integer(inst->dst.type) !=
brw_reg_type_is_unsigned_integer(scan_inst->dst.type))
break;
}
}
} else {
/* Not safe to use inequality operators if the types are
* different.
*/
if (scan_inst->dst.type != inst->src[0].type &&
inst->conditional_mod != BRW_CONDITIONAL_Z &&
inst->conditional_mod != BRW_CONDITIONAL_NZ)
break;
/* Comparisons operate differently for ints and floats */
if (scan_inst->dst.type != inst->dst.type) {
/* Comparison result may be altered if the bit-size changes
* since that affects range, denorms, etc.
*/
if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
break;
if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
brw_reg_type_is_floating_point(inst->dst.type))
break;
}
}
/* Given the following:
* - CMP writes to flag register the result of
* applying cmod to the `src0 - src1`.
* After that it stores the same value to dst.
* Other instructions first store their result to
* dst, and then store cmod(dst) to the flag
* register.
* - inst is either CMP or MOV
* - inst->dst is null
* - inst->src[0] overlaps with scan_inst->dst
* - inst->src[1] is zero
* - scan_inst wrote to a flag register
*
* There can be three possible paths:
*
* - scan_inst is CMP:
*
* Considering that src0 is either 0x0 (false),
* or 0xffffffff (true), and src1 is 0x0:
*
* - If inst's cmod is NZ, we can always remove
* scan_inst: NZ is invariant for false and true. This
* holds even if src0 is NaN: .nz is the only cmod,
* that returns true for NaN.
*
* - .g is invariant if src0 has a UD type
*
* - .l is invariant if src0 has a D type
*
* - scan_inst and inst have the same cmod:
*
* If scan_inst is anything other than CMP, it already
* wrote the appropriate value to the flag register.
*
* - else:
*
* We can change cmod of scan_inst to that of inst,
* and remove inst. It is valid as long as we make
* sure that no instruction uses the flag register
* between scan_inst and inst.
*/
if (!inst->src[0].negate &&
scan_inst->flags_written(devinfo)) {
if (scan_inst->opcode == BRW_OPCODE_CMP) {
if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
(inst->conditional_mod == BRW_CONDITIONAL_G &&
inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
(inst->conditional_mod == BRW_CONDITIONAL_L &&
inst->src[0].type == BRW_REGISTER_TYPE_D)) {
inst->remove(block, true);
progress = true;
break;
}
} else if (scan_inst->conditional_mod == inst->conditional_mod) {
/* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
* flags value is not based on the result stored in the
* destination. On all other platforms sel.cond will not
* write the flags, so execution will not get to this point.
*/
if (scan_inst->opcode == BRW_OPCODE_SEL) {
assert(devinfo->ver <= 5);
} else {
inst->remove(block, true);
progress = true;
}
break;
} else if (!read_flag && scan_inst->can_do_cmod()) {
scan_inst->conditional_mod = inst->conditional_mod;
scan_inst->flag_subreg = inst->flag_subreg;
inst->remove(block, true);
progress = true;
break;
}
}
/* The conditional mod of the CMP/CMPN instructions behaves
* specially because the flag output is not calculated from the
* result of the instruction, but the other way around, which
* means that even if the condmod to propagate and the condmod
* from the CMP instruction are the same they will in general give
* different results because they are evaluated based on different
* inputs.
*/
if (scan_inst->opcode == BRW_OPCODE_CMP ||
scan_inst->opcode == BRW_OPCODE_CMPN)
break;
/* From the Sky Lake PRM, Vol 2a, "Multiply":
*
* "When multiplying integer data types, if one of the sources
* is a DW, the resulting full precision data is stored in
* the accumulator. However, if the destination data type is
* either W or DW, the low bits of the result are written to
* the destination register and the remaining high bits are
* discarded. This results in undefined Overflow and Sign
* flags. Therefore, conditional modifiers and saturation
* (.sat) cannot be used in this case."
*
* We just disallow cmod propagation on all integer multiplies.
*/
if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
scan_inst->opcode == BRW_OPCODE_MUL)
break;
enum brw_conditional_mod cond =
inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
: inst->conditional_mod;
/* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
*
* * Note that the [post condition signal] bits generated at
* the output of a compute are before the .sat.
*
* Paragraph about post_zero does not mention saturation, but
* testing it on actual GPUs shows that conditional modifiers are
* applied after saturation.
*
* * post_zero bit: This bit reflects whether the final
* result is zero after all the clamping, normalizing,
* or format conversion logic.
*
* For this reason, no additional restrictions are necessary on
* instructions with saturate.
*/
/* Otherwise, try propagating the conditional. */
if (scan_inst->can_do_cmod() &&
((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
scan_inst->conditional_mod == cond)) {
scan_inst->conditional_mod = cond;
scan_inst->flag_subreg = inst->flag_subreg;
inst->remove(block, true);
progress = true;
}
break;
}
if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
break;
read_flag = read_flag ||
(scan_inst->flags_read(devinfo) & flags_written) != 0;
}
}
/* There is progress if and only if instructions were removed. */
assert(progress == (block->end_ip_delta != 0));
return progress;
}
bool
fs_visitor::opt_cmod_propagation()
{
bool progress = false;
foreach_block_reverse(block, cfg) {
progress = opt_cmod_propagation_local(devinfo, block) || progress;
}
if (progress) {
cfg->adjust_block_ips();
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
return progress;
}

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,396 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
/** @file brw_fs_cse.cpp
*
* Support for local common subexpression elimination.
*
* See Muchnick's Advanced Compiler Design and Implementation, section
* 13.1 (p378).
*/
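/* For example (an illustrative sequence, not from a real shader), in
 *
 *    mul(8)  g10<1>F  g2<8,8,1>F  g3<8,8,1>F
 *    ...
 *    mul(8)  g20<1>F  g2<8,8,1>F  g3<8,8,1>F
 *
 * the second MUL recomputes an available expression, so it is replaced
 * by a MOV from a temporary holding the first result.
 */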
using namespace brw;
namespace {
struct aeb_entry : public exec_node {
/** The instruction that generates the expression value. */
fs_inst *generator;
/** The temporary where the value is stored. */
fs_reg tmp;
};
}
static bool
is_expression(const fs_visitor *v, const fs_inst *const inst)
{
switch (inst->opcode) {
case BRW_OPCODE_MOV:
case BRW_OPCODE_SEL:
case BRW_OPCODE_NOT:
case BRW_OPCODE_AND:
case BRW_OPCODE_OR:
case BRW_OPCODE_XOR:
case BRW_OPCODE_SHR:
case BRW_OPCODE_SHL:
case BRW_OPCODE_ASR:
case BRW_OPCODE_CMP:
case BRW_OPCODE_CMPN:
case BRW_OPCODE_ADD:
case BRW_OPCODE_MUL:
case SHADER_OPCODE_MULH:
case BRW_OPCODE_FRC:
case BRW_OPCODE_RNDU:
case BRW_OPCODE_RNDD:
case BRW_OPCODE_RNDE:
case BRW_OPCODE_RNDZ:
case BRW_OPCODE_LINE:
case BRW_OPCODE_PLN:
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
case FS_OPCODE_FB_READ_LOGICAL:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
case FS_OPCODE_LINTERP:
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
case FS_OPCODE_LOAD_LIVE_CHANNELS:
case SHADER_OPCODE_BROADCAST:
case SHADER_OPCODE_MOV_INDIRECT:
case SHADER_OPCODE_TEX_LOGICAL:
case SHADER_OPCODE_TXD_LOGICAL:
case SHADER_OPCODE_TXF_LOGICAL:
case SHADER_OPCODE_TXL_LOGICAL:
case SHADER_OPCODE_TXS_LOGICAL:
case FS_OPCODE_TXB_LOGICAL:
case SHADER_OPCODE_TXF_CMS_LOGICAL:
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
case SHADER_OPCODE_TXF_UMS_LOGICAL:
case SHADER_OPCODE_TXF_MCS_LOGICAL:
case SHADER_OPCODE_LOD_LOGICAL:
case SHADER_OPCODE_TG4_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
case FS_OPCODE_PACK:
return true;
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_RSQ:
case SHADER_OPCODE_SQRT:
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_POW:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS:
return inst->mlen < 2;
case SHADER_OPCODE_LOAD_PAYLOAD:
return !is_coalescing_payload(v->alloc, inst);
default:
return inst->is_send_from_grf() && !inst->has_side_effects() &&
!inst->is_volatile();
}
}
static bool
operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
{
fs_reg *xs = a->src;
fs_reg *ys = b->src;
if (a->opcode == BRW_OPCODE_MAD) {
return xs[0].equals(ys[0]) &&
((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
(xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
} else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) {
bool xs0_negate = xs[0].negate;
bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f
: xs[1].negate;
bool ys0_negate = ys[0].negate;
bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f
: ys[1].negate;
float xs1_imm = xs[1].f;
float ys1_imm = ys[1].f;
xs[0].negate = false;
xs[1].negate = false;
ys[0].negate = false;
ys[1].negate = false;
xs[1].f = fabsf(xs[1].f);
ys[1].f = fabsf(ys[1].f);
bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
(xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
xs[0].negate = xs0_negate;
xs[1].negate = xs[1].file == IMM ? false : xs1_negate;
ys[0].negate = ys0_negate;
ys[1].negate = ys[1].file == IMM ? false : ys1_negate;
xs[1].f = xs1_imm;
ys[1].f = ys1_imm;
*negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate);
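/* Illustrative values: a = mul(dst, -x, y) against b = mul(dst, x, -y)
 * leaves *negate false, since each product negates exactly one operand
 * and the signs cancel; a = mul(dst, -x, y) against b = mul(dst, x, y)
 * sets *negate, so b's reuse of a's result must be negated.
 */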
if (*negate && (a->saturate || b->saturate))
return false;
return ret;
} else if (!a->is_commutative()) {
bool match = true;
for (int i = 0; i < a->sources; i++) {
if (!xs[i].equals(ys[i])) {
match = false;
break;
}
}
return match;
} else {
return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
(xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
}
}
static bool
instructions_match(fs_inst *a, fs_inst *b, bool *negate)
{
return a->opcode == b->opcode &&
a->force_writemask_all == b->force_writemask_all &&
a->exec_size == b->exec_size &&
a->group == b->group &&
a->saturate == b->saturate &&
a->predicate == b->predicate &&
a->predicate_inverse == b->predicate_inverse &&
a->conditional_mod == b->conditional_mod &&
a->flag_subreg == b->flag_subreg &&
a->dst.type == b->dst.type &&
a->offset == b->offset &&
a->mlen == b->mlen &&
a->ex_mlen == b->ex_mlen &&
a->sfid == b->sfid &&
a->desc == b->desc &&
a->size_written == b->size_written &&
a->base_mrf == b->base_mrf &&
a->check_tdr == b->check_tdr &&
a->send_has_side_effects == b->send_has_side_effects &&
a->eot == b->eot &&
a->header_size == b->header_size &&
a->shadow_compare == b->shadow_compare &&
a->pi_noperspective == b->pi_noperspective &&
a->target == b->target &&
a->sources == b->sources &&
operands_match(a, b, negate);
}
static void
create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
{
unsigned written = regs_written(inst);
unsigned dst_width =
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
fs_inst *copy;
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
assert(src.file == VGRF);
fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg,
inst->sources);
for (int i = 0; i < inst->header_size; i++) {
payload[i] = src;
src.offset += REG_SIZE;
}
for (int i = inst->header_size; i < inst->sources; i++) {
src.type = inst->src[i].type;
payload[i] = src;
src = offset(src, bld, 1);
}
copy = bld.LOAD_PAYLOAD(inst->dst, payload, inst->sources,
inst->header_size);
} else if (written != dst_width) {
assert(src.file == VGRF);
assert(written % dst_width == 0);
const int sources = written / dst_width;
fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
for (int i = 0; i < sources; i++) {
payload[i] = src;
src = offset(src, bld, 1);
}
copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, 0);
} else {
copy = bld.MOV(inst->dst, src);
copy->group = inst->group;
copy->force_writemask_all = inst->force_writemask_all;
copy->src[0].negate = negate;
}
assert(regs_written(copy) == written);
}
bool
fs_visitor::opt_cse_local(const fs_live_variables &live, bblock_t *block, int &ip)
{
bool progress = false;
exec_list aeb;
void *cse_ctx = ralloc_context(NULL);
foreach_inst_in_block(fs_inst, inst, block) {
/* Only consider full writes of CSE-able expressions to safe destinations. */
if (is_expression(this, inst) && !inst->is_partial_write() &&
((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
inst->dst.is_null()))
{
bool found = false;
bool negate = false;
foreach_in_list_use_after(aeb_entry, entry, &aeb) {
/* Match current instruction's expression against those in AEB. */
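/* A generator with a null destination (e.g. a CMP emitted only for its
 * flag result) cannot supply a value, so it may only match an
 * instruction whose destination is also null.
 */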
if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
instructions_match(inst, entry->generator, &negate)) {
found = true;
progress = true;
break;
}
}
if (!found) {
if (inst->opcode != BRW_OPCODE_MOV ||
(inst->opcode == BRW_OPCODE_MOV &&
inst->src[0].file == IMM &&
inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
/* Our first sighting of this expression. Create an entry. */
aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
entry->tmp = reg_undef;
entry->generator = inst;
aeb.push_tail(entry);
}
} else {
/* This is at least our second sighting of this expression.
* If we don't have a temporary already, make one.
*/
bool no_existing_temp = entry->tmp.file == BAD_FILE;
if (no_existing_temp && !entry->generator->dst.is_null()) {
const fs_builder ibld = fs_builder(this, block, entry->generator)
.at(block, entry->generator->next);
int written = regs_written(entry->generator);
entry->tmp = fs_reg(VGRF, alloc.allocate(written),
entry->generator->dst.type);
create_copy_instr(ibld, entry->generator, entry->tmp, false);
entry->generator->dst = entry->tmp;
}
/* dest <- temp */
if (!inst->dst.is_null()) {
assert(inst->size_written == entry->generator->size_written);
assert(inst->dst.type == entry->tmp.type);
const fs_builder ibld(this, block, inst);
create_copy_instr(ibld, inst, entry->tmp, negate);
}
/* Set our iterator so that next time through the loop inst->next
* will get the instruction in the basic block after the one we've
* removed.
*/
fs_inst *prev = (fs_inst *)inst->prev;
inst->remove(block);
inst = prev;
}
}
/* Discard jumps aren't represented in the CFG unfortunately, so we need
* to make sure that they behave as a CSE barrier, since we lack global
* dataflow information. This is particularly likely to cause problems
* with instructions dependent on the current execution mask like
* SHADER_OPCODE_FIND_LIVE_CHANNEL.
*/
if (inst->opcode == BRW_OPCODE_HALT ||
inst->opcode == SHADER_OPCODE_HALT_TARGET)
aeb.make_empty();
foreach_in_list_safe(aeb_entry, entry, &aeb) {
/* If we just wrote the flag register, kill all AEB entries that read
* it or that write a different value to it.
*/
if (inst->flags_written(devinfo)) {
bool negate; /* dummy */
if (entry->generator->flags_read(devinfo) ||
(entry->generator->flags_written(devinfo) &&
!instructions_match(inst, entry->generator, &negate))) {
entry->remove();
ralloc_free(entry);
continue;
}
}
for (int i = 0; i < entry->generator->sources; i++) {
fs_reg *src_reg = &entry->generator->src[i];
/* Kill all AEB entries that use the destination we just
* overwrote.
*/
if (regions_overlap(inst->dst, inst->size_written,
entry->generator->src[i],
entry->generator->size_read(i))) {
entry->remove();
ralloc_free(entry);
break;
}
/* Kill any AEB entries using registers that don't get reused any
* more -- a sure sign they'll fail operands_match().
*/
if (src_reg->file == VGRF && live.vgrf_end[src_reg->nr] < ip) {
entry->remove();
ralloc_free(entry);
break;
}
}
}
ip++;
}
ralloc_free(cse_ctx);
return progress;
}
bool
fs_visitor::opt_cse()
{
const fs_live_variables &live = live_analysis.require();
bool progress = false;
int ip = 0;
foreach_block (block, cfg) {
progress = opt_cse_local(live, block, ip) || progress;
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
return progress;
}


@ -0,0 +1,152 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_cfg.h"
/** @file brw_fs_dead_code_eliminate.cpp
*
* Dataflow-aware dead code elimination.
*
* Walks the instruction list from the bottom, removing instructions whose
* results are not used in later blocks and have not been read in the
* tail end of this block.
*/
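/* For instance (hypothetical registers), if g10 is not in the liveout
 * set of the block and nothing below reads it,
 *
 *    add(8)  g10<1>F  g2<8,8,1>F  g3<8,8,1>F
 *
 * produces a dead result and can be removed, provided it has no side
 * effects and writes no live flags.
 */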
using namespace brw;
/**
* Is it safe to eliminate the instruction?
*/
static bool
can_eliminate(const intel_device_info *devinfo, const fs_inst *inst,
BITSET_WORD *flag_live)
{
return !inst->is_control_flow() &&
!inst->has_side_effects() &&
!(flag_live[0] & inst->flags_written(devinfo)) &&
!inst->writes_accumulator;
}
/**
* Is it safe to omit the write, making the destination ARF null?
*/
static bool
can_omit_write(const fs_inst *inst)
{
switch (inst->opcode) {
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
return true;
default:
/* We can eliminate the destination write for ordinary instructions,
* but not most SENDs.
*/
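/* Opcode values below 128 are assumed here to be the ordinary hardware
 * opcodes mentioned above, with virtual opcodes numbered higher; a
 * hardware instruction carrying no message payload can safely drop its
 * destination write.
 */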
if (inst->opcode < 128 && inst->mlen == 0)
return true;
/* It might not be safe for other virtual opcodes. */
return false;
}
}
bool
fs_visitor::dead_code_eliminate()
{
bool progress = false;
const fs_live_variables &live_vars = live_analysis.require();
int num_vars = live_vars.num_vars;
BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars));
BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1);
foreach_block_reverse_safe(block, cfg) {
memcpy(live, live_vars.block_data[block->num].liveout,
sizeof(BITSET_WORD) * BITSET_WORDS(num_vars));
memcpy(flag_live, live_vars.block_data[block->num].flag_liveout,
sizeof(BITSET_WORD));
foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
if (inst->dst.file == VGRF) {
const unsigned var = live_vars.var_from_reg(inst->dst);
bool result_live = false;
for (unsigned i = 0; i < regs_written(inst); i++)
result_live |= BITSET_TEST(live, var + i);
if (!result_live &&
(can_omit_write(inst) || can_eliminate(devinfo, inst, flag_live))) {
inst->dst = fs_reg(spread(retype(brw_null_reg(), inst->dst.type),
inst->dst.stride));
progress = true;
}
}
if (inst->dst.is_null() && can_eliminate(devinfo, inst, flag_live)) {
inst->opcode = BRW_OPCODE_NOP;
progress = true;
}
if (inst->dst.file == VGRF) {
if (!inst->is_partial_write()) {
const unsigned var = live_vars.var_from_reg(inst->dst);
for (unsigned i = 0; i < regs_written(inst); i++) {
BITSET_CLEAR(live, var + i);
}
}
}
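/* A write that is predicated or narrower than SIMD8 only partially
 * defines the flag bits, so it is not treated as killing flag liveness;
 * this mirrors the matching condition in
 * fs_live_variables::setup_def_use().
 */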
if (!inst->predicate && inst->exec_size >= 8)
flag_live[0] &= ~inst->flags_written(devinfo);
if (inst->opcode == BRW_OPCODE_NOP) {
inst->remove(block, true);
continue;
}
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
int var = live_vars.var_from_reg(inst->src[i]);
for (unsigned j = 0; j < regs_read(inst, i); j++) {
BITSET_SET(live, var + j);
}
}
}
flag_live[0] |= inst->flags_read(devinfo);
}
}
cfg->adjust_block_ips();
ralloc_free(live);
ralloc_free(flag_live);
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}

File diff suppressed because it is too large


@ -0,0 +1,371 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
using namespace brw;
#define MAX_INSTRUCTION (1 << 30)
/** @file brw_fs_live_variables.cpp
*
* Support for calculating liveness information about virtual GRFs.
*
* This produces a live interval for each whole virtual GRF. We could
* choose to expose per-component live intervals for VGRFs of size > 1,
* but we currently do not. It is easier for the consumers of this
* information to work with whole VGRFs.
*
* However, we internally track use/def information at the per-GRF level for
* greater accuracy. Large VGRFs may be accessed piecemeal over many
* (possibly non-adjacent) instructions. In this case, examining a single
* instruction is insufficient to decide whether a whole VGRF is ultimately
* used or defined. Tracking individual components allows us to easily
* assemble this information.
*
* See Muchnick's Advanced Compiler Design and Implementation, section
* 14.1 (p444).
*/
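/* A sketch of why this matters: for a VGRF two registers long, written
 * by two instructions that each fully cover one register, no single
 * instruction defines the whole VGRF, yet the combined per-register
 * def[] bits still show that the block screens off earlier definitions.
 */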
void
fs_live_variables::setup_one_read(struct block_data *bd,
int ip, const fs_reg &reg)
{
int var = var_from_reg(reg);
assert(var < num_vars);
start[var] = MIN2(start[var], ip);
end[var] = MAX2(end[var], ip);
/* The use[] bitset marks when the block makes use of a variable (VGRF
* channel) without having completely defined that variable within the
* block.
*/
if (!BITSET_TEST(bd->def, var))
BITSET_SET(bd->use, var);
}
void
fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
int ip, const fs_reg &reg)
{
int var = var_from_reg(reg);
assert(var < num_vars);
start[var] = MIN2(start[var], ip);
end[var] = MAX2(end[var], ip);
/* The def[] bitset marks when an initialization in a block completely
* screens off previous updates of that variable (VGRF channel).
*/
if (inst->dst.file == VGRF) {
if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
BITSET_SET(bd->def, var);
BITSET_SET(bd->defout, var);
}
}
/**
* Sets up the use[] and def[] bitsets.
*
* The basic-block-level live variable analysis needs to know which
* variables get used before they're completely defined, and which
* variables are completely defined before they're used.
*
* These are tracked at the per-component level, rather than whole VGRFs.
*/
void
fs_live_variables::setup_def_use()
{
int ip = 0;
foreach_block (block, cfg) {
assert(ip == block->start_ip);
if (block->num > 0)
assert(cfg->blocks[block->num - 1]->end_ip == ip - 1);
struct block_data *bd = &block_data[block->num];
foreach_inst_in_block(fs_inst, inst, block) {
/* Set use[] for this instruction */
for (unsigned int i = 0; i < inst->sources; i++) {
fs_reg reg = inst->src[i];
if (reg.file != VGRF)
continue;
for (unsigned j = 0; j < regs_read(inst, i); j++) {
setup_one_read(bd, ip, reg);
reg.offset += REG_SIZE;
}
}
bd->flag_use[0] |= inst->flags_read(devinfo) & ~bd->flag_def[0];
/* Set def[] for this instruction */
if (inst->dst.file == VGRF) {
fs_reg reg = inst->dst;
for (unsigned j = 0; j < regs_written(inst); j++) {
setup_one_write(bd, inst, ip, reg);
reg.offset += REG_SIZE;
}
}
if (!inst->predicate && inst->exec_size >= 8)
bd->flag_def[0] |= inst->flags_written(devinfo) & ~bd->flag_use[0];
ip++;
}
}
}
/**
* The algorithm incrementally sets bits in liveout and livein,
* propagating it through control flow. It will eventually terminate
* because it only ever adds bits, and stops when no bits are added in
* a pass.
*/
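/* In dataflow terms, the fixed-point loops below compute (sketch):
 *
 *    liveout(B) = union of livein(C) over children C of B
 *    livein(B)  = use(B) | (liveout(B) & ~def(B))
 *
 * with both sets masked by defout/defin to screen off uses that no
 * definition can reach.
 */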
void
fs_live_variables::compute_live_variables()
{
bool cont = true;
/* Propagate defin and defout down the CFG to calculate the union of live
* variables potentially defined along any possible control flow path.
*/
do {
cont = false;
foreach_block (block, cfg) {
const struct block_data *bd = &block_data[block->num];
foreach_list_typed(bblock_link, child_link, link, &block->children) {
struct block_data *child_bd = &block_data[child_link->block->num];
for (int i = 0; i < bitset_words; i++) {
const BITSET_WORD new_def = bd->defout[i] & ~child_bd->defin[i];
child_bd->defin[i] |= new_def;
child_bd->defout[i] |= new_def;
cont |= new_def;
}
}
}
} while (cont);
do {
cont = false;
foreach_block_reverse (block, cfg) {
struct block_data *bd = &block_data[block->num];
/* Update liveout */
foreach_list_typed(bblock_link, child_link, link, &block->children) {
struct block_data *child_bd = &block_data[child_link->block->num];
for (int i = 0; i < bitset_words; i++) {
BITSET_WORD new_liveout = (child_bd->livein[i] &
~bd->liveout[i]);
new_liveout &= bd->defout[i]; /* Screen off uses with no reaching def */
if (new_liveout)
bd->liveout[i] |= new_liveout;
}
BITSET_WORD new_liveout = (child_bd->flag_livein[0] &
~bd->flag_liveout[0]);
if (new_liveout)
bd->flag_liveout[0] |= new_liveout;
}
/* Update livein */
for (int i = 0; i < bitset_words; i++) {
BITSET_WORD new_livein = (bd->use[i] |
(bd->liveout[i] &
~bd->def[i]));
new_livein &= bd->defin[i]; /* Screen off uses with no reaching def */
if (new_livein & ~bd->livein[i]) {
bd->livein[i] |= new_livein;
cont = true;
}
}
BITSET_WORD new_livein = (bd->flag_use[0] |
(bd->flag_liveout[0] &
~bd->flag_def[0]));
if (new_livein & ~bd->flag_livein[0]) {
bd->flag_livein[0] |= new_livein;
cont = true;
}
}
} while (cont);
}
/**
* Extend the start/end ranges for each variable to account for the
* new information calculated from control flow.
*/
void
fs_live_variables::compute_start_end()
{
foreach_block (block, cfg) {
struct block_data *bd = &block_data[block->num];
unsigned i;
BITSET_FOREACH_SET(i, bd->livein, (unsigned)num_vars) {
start[i] = MIN2(start[i], block->start_ip);
end[i] = MAX2(end[i], block->start_ip);
}
BITSET_FOREACH_SET(i, bd->liveout, (unsigned)num_vars) {
start[i] = MIN2(start[i], block->end_ip);
end[i] = MAX2(end[i], block->end_ip);
}
}
}
fs_live_variables::fs_live_variables(const backend_shader *s)
: devinfo(s->devinfo), cfg(s->cfg)
{
mem_ctx = ralloc_context(NULL);
linear_ctx *lin_ctx = linear_context(mem_ctx);
num_vgrfs = s->alloc.count;
num_vars = 0;
var_from_vgrf = linear_zalloc_array(lin_ctx, int, num_vgrfs);
for (int i = 0; i < num_vgrfs; i++) {
var_from_vgrf[i] = num_vars;
num_vars += s->alloc.sizes[i];
}
vgrf_from_var = linear_zalloc_array(lin_ctx, int, num_vars);
for (int i = 0; i < num_vgrfs; i++) {
for (unsigned j = 0; j < s->alloc.sizes[i]; j++) {
vgrf_from_var[var_from_vgrf[i] + j] = i;
}
}
start = ralloc_array(mem_ctx, int, num_vars);
end = linear_zalloc_array(lin_ctx, int, num_vars);
for (int i = 0; i < num_vars; i++) {
start[i] = MAX_INSTRUCTION;
end[i] = -1;
}
vgrf_start = ralloc_array(mem_ctx, int, num_vgrfs);
vgrf_end = ralloc_array(mem_ctx, int, num_vgrfs);
for (int i = 0; i < num_vgrfs; i++) {
vgrf_start[i] = MAX_INSTRUCTION;
vgrf_end[i] = -1;
}
block_data = linear_zalloc_array(lin_ctx, struct block_data, cfg->num_blocks);
bitset_words = BITSET_WORDS(num_vars);
for (int i = 0; i < cfg->num_blocks; i++) {
block_data[i].def = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].use = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].livein = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].liveout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].defin = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].defout = linear_zalloc_array(lin_ctx, BITSET_WORD, bitset_words);
block_data[i].flag_def[0] = 0;
block_data[i].flag_use[0] = 0;
block_data[i].flag_livein[0] = 0;
block_data[i].flag_liveout[0] = 0;
}
setup_def_use();
compute_live_variables();
compute_start_end();
/* Merge the per-component live ranges to whole VGRF live ranges. */
for (int i = 0; i < num_vars; i++) {
const unsigned vgrf = vgrf_from_var[i];
vgrf_start[vgrf] = MIN2(vgrf_start[vgrf], start[i]);
vgrf_end[vgrf] = MAX2(vgrf_end[vgrf], end[i]);
}
}
fs_live_variables::~fs_live_variables()
{
ralloc_free(mem_ctx);
}
static bool
check_register_live_range(const fs_live_variables *live, int ip,
const fs_reg &reg, unsigned n)
{
const unsigned var = live->var_from_reg(reg);
if (var + n > unsigned(live->num_vars) ||
live->vgrf_start[reg.nr] > ip || live->vgrf_end[reg.nr] < ip)
return false;
for (unsigned j = 0; j < n; j++) {
if (live->start[var + j] > ip || live->end[var + j] < ip)
return false;
}
return true;
}
bool
fs_live_variables::validate(const backend_shader *s) const
{
int ip = 0;
foreach_block_and_inst(block, fs_inst, inst, s->cfg) {
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF &&
!check_register_live_range(this, ip,
inst->src[i], regs_read(inst, i)))
return false;
}
if (inst->dst.file == VGRF &&
!check_register_live_range(this, ip, inst->dst, regs_written(inst)))
return false;
ip++;
}
return true;
}
bool
fs_live_variables::vars_interfere(int a, int b) const
{
return !(end[b] <= start[a] ||
end[a] <= start[b]);
}
bool
fs_live_variables::vgrfs_interfere(int a, int b) const
{
return !(vgrf_end[a] <= vgrf_start[b] ||
vgrf_end[b] <= vgrf_start[a]);
}


@ -0,0 +1,148 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#ifndef BRW_FS_LIVE_VARIABLES_H
#define BRW_FS_LIVE_VARIABLES_H
#include "brw_ir_analysis.h"
#include "brw_ir_fs.h"
#include "util/bitset.h"
struct cfg_t;
struct backend_shader;
namespace brw {
class fs_live_variables {
public:
struct block_data {
/**
* Which variables are defined before being used in the block.
*
* Note that for our purposes, "defined" means unconditionally, completely
* defined.
*/
BITSET_WORD *def;
/**
* Which variables are used before being defined in the block.
*/
BITSET_WORD *use;
/** Which defs reach the entry point of the block. */
BITSET_WORD *livein;
/** Which defs reach the exit point of the block. */
BITSET_WORD *liveout;
/**
* Variables such that the entry point of the block may be reached from any
* of their definitions.
*/
BITSET_WORD *defin;
/**
* Variables such that the exit point of the block may be reached from any
* of their definitions.
*/
BITSET_WORD *defout;
BITSET_WORD flag_def[1];
BITSET_WORD flag_use[1];
BITSET_WORD flag_livein[1];
BITSET_WORD flag_liveout[1];
};
fs_live_variables(const backend_shader *s);
~fs_live_variables();
bool validate(const backend_shader *s) const;
analysis_dependency_class
dependency_class() const
{
return (DEPENDENCY_INSTRUCTION_IDENTITY |
DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_VARIABLES);
}
bool vars_interfere(int a, int b) const;
bool vgrfs_interfere(int a, int b) const;
int var_from_reg(const fs_reg &reg) const
{
return var_from_vgrf[reg.nr] + reg.offset / REG_SIZE;
}
/** Map from virtual GRF number to index in block_data arrays. */
int *var_from_vgrf;
/**
* Map from any index in block_data to the virtual GRF containing it.
*
* For alloc.sizes of [1, 2, 3], vgrf_from_var would contain
* [0, 1, 1, 2, 2, 2].
*/
int *vgrf_from_var;
int num_vars;
int num_vgrfs;
int bitset_words;
/** @{
* Final computed live ranges for each var (each component of each virtual
* GRF).
*/
int *start;
int *end;
/** @} */
/** @{
* Final computed live ranges for each VGRF.
*/
int *vgrf_start;
int *vgrf_end;
/** @} */
/** Per-basic-block information on live variables */
struct block_data *block_data;
protected:
void setup_def_use();
void setup_one_read(struct block_data *bd, int ip, const fs_reg &reg);
void setup_one_write(struct block_data *bd, fs_inst *inst, int ip,
const fs_reg &reg);
void compute_live_variables();
void compute_start_end();
const struct intel_device_info *devinfo;
const cfg_t *cfg;
void *mem_ctx;
};
} /* namespace brw */
#endif /* BRW_FS_LIVE_VARIABLES_H */


@ -0,0 +1,306 @@
/*
* Copyright 2023 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
using namespace brw;
static void
f16_using_mac(const fs_builder &bld, fs_inst *inst)
{
/* We only intend to support configurations where the destination and
* accumulator have the same type.
*/
if (!inst->src[0].is_null())
assert(inst->dst.type == inst->src[0].type);
assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);
const brw_reg_type src0_type = inst->dst.type;
const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;
const fs_reg dest = inst->dst;
fs_reg src0 = inst->src[0];
const fs_reg src1 = retype(inst->src[1], src1_type);
const fs_reg src2 = retype(inst->src[2], src2_type);
const unsigned dest_stride =
dest.type == BRW_REGISTER_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;
for (unsigned r = 0; r < inst->rcount; r++) {
fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);
for (unsigned subword = 0; subword < 2; subword++) {
for (unsigned s = 0; s < inst->sdepth; s++) {
/* The first multiply of the dot-product operation has to
* explicitly write the accumulator register. The successive MAC
* instructions will implicitly read *and* write the
* accumulator. Those MAC instructions can also optionally
* explicitly write some other register.
*
* FINISHME: The accumulator can actually hold 16 HF values. On
* Gfx12 there are two accumulators. It should be possible to do
* this in SIMD16 or even SIMD32. I was unable to get this to work
* properly.
*/
if (s == 0 && subword == 0) {
const unsigned acc_width = 8;
fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
inst->group % acc_width);
if (bld.shader->devinfo->verx10 >= 125) {
acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
} else {
acc = retype(acc, BRW_REGISTER_TYPE_HF);
}
bld.MUL(acc,
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
BRW_REGISTER_TYPE_HF, subword),
component(retype(byte_offset(src2, r * REG_SIZE),
BRW_REGISTER_TYPE_HF),
s * 2 + subword))
->writes_accumulator = true;
} else {
fs_reg result;
/* As mentioned above, the MAC had an optional, explicit
* destination register. Various optimization passes are not
* clever enough to understand the intricacies of this
* instruction, so only write the result register on the final
* MAC in the sequence.
*/
if ((s + 1) == inst->sdepth && subword == 1)
result = temp;
else
result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);
bld.MAC(result,
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
BRW_REGISTER_TYPE_HF, subword),
component(retype(byte_offset(src2, r * REG_SIZE),
BRW_REGISTER_TYPE_HF),
s * 2 + subword))
->writes_accumulator = true;
}
}
}
if (!src0.is_null()) {
if (src0_type != BRW_REGISTER_TYPE_HF) {
fs_reg temp2 = bld.vgrf(src0_type, 1);
bld.MOV(temp2, temp);
bld.ADD(byte_offset(dest, r * dest_stride),
temp2,
byte_offset(src0, r * dest_stride));
} else {
bld.ADD(byte_offset(dest, r * dest_stride),
temp,
byte_offset(src0, r * dest_stride));
}
} else {
bld.MOV(byte_offset(dest, r * dest_stride), temp);
}
}
}
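/* DP4A computes a four-element dot product of packed bytes plus an
 * accumulator in a single instruction, roughly
 * dst = src0 + sum_{i<4}(src1.b[i] * src2.b[i]); the expansion below
 * chains it once per systolic-depth step (a sketch of the intent, not
 * additional hardware documentation).
 */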
static void
int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
{
/* We only intend to support configurations where the destination and
* accumulator have the same type.
*/
if (!inst->src[0].is_null())
assert(inst->dst.type == inst->src[0].type);
assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
inst->src[1].type == BRW_REGISTER_TYPE_UB);
assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
inst->src[2].type == BRW_REGISTER_TYPE_UB);
const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
fs_reg dest = inst->dst;
fs_reg src0 = inst->src[0];
const fs_reg src1 = retype(inst->src[1], src1_type);
const fs_reg src2 = retype(inst->src[2], src2_type);
const unsigned dest_stride = REG_SIZE;
for (unsigned r = 0; r < inst->rcount; r++) {
if (!src0.is_null()) {
bld.MOV(dest, src0);
src0 = byte_offset(src0, dest_stride);
} else {
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
}
for (unsigned s = 0; s < inst->sdepth; s++) {
bld.DP4A(dest,
dest,
byte_offset(src1, s * REG_SIZE),
component(byte_offset(src2, r * REG_SIZE), s))
->saturate = inst->saturate;
}
dest = byte_offset(dest, dest_stride);
}
}
static void
int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
{
/* We only intend to support configurations where the destination and
* accumulator have the same type.
*/
if (!inst->src[0].is_null())
assert(inst->dst.type == inst->src[0].type);
assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
inst->src[1].type == BRW_REGISTER_TYPE_UB);
assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
inst->src[2].type == BRW_REGISTER_TYPE_UB);
const brw_reg_type src0_type = inst->dst.type;
const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
fs_reg dest = inst->dst;
fs_reg src0 = inst->src[0];
const fs_reg src1 = retype(inst->src[1], src1_type);
const fs_reg src2 = retype(inst->src[2], src2_type);
const unsigned dest_stride = REG_SIZE;
for (unsigned r = 0; r < inst->rcount; r++) {
if (!src0.is_null()) {
bld.MOV(dest, src0);
src0 = byte_offset(src0, dest_stride);
} else {
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
}
for (unsigned s = 0; s < inst->sdepth; s++) {
fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
const brw_reg_type temp_type =
(inst->src[1].type == BRW_REGISTER_TYPE_B ||
inst->src[2].type == BRW_REGISTER_TYPE_B)
? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;
/* Expand 8 dwords of packed bytes into 16 dwords of packed
* words.
*
* FINISHME: Gfx9 should not need this workaround. Gfx11
* may be able to use integer MAD. Both platforms may be
* able to use MAC.
*/
bld.group(32, 0).MOV(retype(temp3, temp_type),
retype(byte_offset(src2, r * REG_SIZE),
inst->src[2].type));
bld.MUL(subscript(temp1, temp_type, 0),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
inst->src[1].type, 0),
subscript(component(retype(temp3,
BRW_REGISTER_TYPE_UD),
s * 2),
temp_type, 0));
bld.MUL(subscript(temp1, temp_type, 1),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
inst->src[1].type, 1),
subscript(component(retype(temp3,
BRW_REGISTER_TYPE_UD),
s * 2),
temp_type, 1));
bld.MUL(subscript(temp2, temp_type, 0),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
inst->src[1].type, 2),
subscript(component(retype(temp3,
BRW_REGISTER_TYPE_UD),
s * 2 + 1),
temp_type, 0));
bld.MUL(subscript(temp2, temp_type, 1),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_REGISTER_TYPE_UD),
inst->src[1].type, 3),
subscript(component(retype(temp3,
BRW_REGISTER_TYPE_UD),
s * 2 + 1),
temp_type, 1));
bld.ADD(subscript(temp1, src0_type, 0),
subscript(temp1, temp_type, 0),
subscript(temp1, temp_type, 1));
bld.ADD(subscript(temp2, src0_type, 0),
subscript(temp2, temp_type, 0),
subscript(temp2, temp_type, 1));
bld.ADD(retype(temp1, src0_type),
retype(temp1, src0_type),
retype(temp2, src0_type));
bld.ADD(dest, dest, retype(temp1, src0_type))
->saturate = inst->saturate;
}
dest = byte_offset(dest, dest_stride);
}
}
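/* Lower DPAS instructions into sequences the EU can execute directly:
 * floating-point DPAS via MAC, integer DPAS via DP4A on Gfx12+, and via
 * MUL/ADD on everything older.
 */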
bool
brw_lower_dpas(fs_visitor &v)
{
bool progress = false;
foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
if (inst->opcode != BRW_OPCODE_DPAS)
continue;
const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all();
if (brw_reg_type_is_floating_point(inst->dst.type)) {
f16_using_mac(bld, inst);
} else {
if (v.devinfo->ver >= 12) {
int8_using_dp4a(bld, inst);
} else {
int8_using_mul_add(bld, inst);
}
}
inst->remove(block);
progress = true;
}
if (progress)
v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}


@@ -0,0 +1,92 @@
/*
* Copyright © 2015 Connor Abbott
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "util/half_float.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"
using namespace brw;
bool
fs_visitor::lower_pack()
{
bool progress = false;
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
if (inst->opcode != FS_OPCODE_PACK &&
inst->opcode != FS_OPCODE_PACK_HALF_2x16_SPLIT)
continue;
assert(inst->dst.file == VGRF);
assert(inst->saturate == false);
fs_reg dst = inst->dst;
const fs_builder ibld(this, block, inst);
/* The lowering generates 2 instructions for what was previously 1. This
* can trick the IR to believe we're doing partial writes, but the
* register is actually fully written. Mark it as undef to help the IR
* reduce the liveness of the register.
*/
if (!inst->is_partial_write())
ibld.emit_undef_for_dst(inst);
switch (inst->opcode) {
case FS_OPCODE_PACK:
for (unsigned i = 0; i < inst->sources; i++)
ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);
break;
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
assert(dst.type == BRW_REGISTER_TYPE_UD);
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == IMM) {
const uint32_t half = _mesa_float_to_half(inst->src[i].f);
ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, i),
brw_imm_uw(half));
} else if (i == 1 && devinfo->ver < 9) {
/* Pre-Skylake requires DWord aligned destinations */
fs_reg tmp = ibld.vgrf(BRW_REGISTER_TYPE_UD);
ibld.F32TO16(subscript(tmp, BRW_REGISTER_TYPE_HF, 0),
inst->src[i]);
ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, 1),
subscript(tmp, BRW_REGISTER_TYPE_UW, 0));
} else {
ibld.F32TO16(subscript(dst, BRW_REGISTER_TYPE_HF, i),
inst->src[i]);
}
}
break;
default:
unreachable("skipped above");
}
inst->remove(block);
progress = true;
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}


@@ -0,0 +1,677 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"
using namespace brw;
namespace {
/* From the SKL PRM Vol 2a, "Move":
*
* "A mov with the same source and destination type, no source modifier,
* and no saturation is a raw move. A packed byte destination region (B
* or UB type with HorzStride == 1 and ExecSize > 1) can only be written
* using raw move."
*/
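/* For instance, mov(16) g10<1>UB g20<8,8,1>UB qualifies as a raw byte
 * move, while the same MOV from a UW source does not, since the implicit
 * type conversion makes it more than a raw copy.
 */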
bool
is_byte_raw_mov(const fs_inst *inst)
{
return type_sz(inst->dst.type) == 1 &&
inst->opcode == BRW_OPCODE_MOV &&
inst->src[0].type == inst->dst.type &&
!inst->saturate &&
!inst->src[0].negate &&
!inst->src[0].abs;
}
/*
* Return an acceptable byte stride for the destination of an instruction
* that requires it to have some particular alignment.
*/
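/* E.g. a MOV from a D source into a B destination is a narrowing
 * conversion, so the B destination must use a byte stride of 4 (the
 * execution type size) rather than a packed stride of 1.
 */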
unsigned
required_dst_byte_stride(const fs_inst *inst)
{
if (inst->dst.is_accumulator()) {
/* If the destination is an accumulator, insist that we leave the
* stride alone. We cannot "fix" accumulator destinations by writing
* to a temporary and emitting a MOV into the original destination.
* For multiply instructions (our one use of the accumulator), the
* MUL writes the full 66 bits of the accumulator whereas the MOV we
* would emit only writes 33 bits and leaves the top 33 bits
* undefined.
*
* It's safe to just require the original stride here because the
* lowering pass will detect the mismatch in has_invalid_src_region
* and fix the sources of the multiply instead of the destination.
*/
return inst->dst.stride * type_sz(inst->dst.type);
} else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
!is_byte_raw_mov(inst)) {
return get_exec_type_size(inst);
} else {
/* Calculate the maximum byte stride and the minimum/maximum type
* size across all source and destination operands we are required to
* lower.
*/
unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
unsigned min_size = type_sz(inst->dst.type);
unsigned max_size = type_sz(inst->dst.type);
for (unsigned i = 0; i < inst->sources; i++) {
if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
const unsigned size = type_sz(inst->src[i].type);
max_stride = MAX2(max_stride, inst->src[i].stride * size);
min_size = MIN2(min_size, size);
max_size = MAX2(max_size, size);
}
}
/* All operands involved in lowering need to fit in the calculated
* stride.
*/
assert(max_size <= 4 * min_size);
/* Attempt to use the largest byte stride among all present operands,
* but never exceed a stride of 4 since that would lead to illegal
* destination regions during lowering.
*/
return MIN2(max_stride, 4 * min_size);
}
}
/*
* Return an acceptable byte sub-register offset for the destination of an
* instruction that requires it to be aligned to the sub-register offset of
* the sources.
*/
unsigned
required_dst_byte_offset(const intel_device_info *devinfo, const fs_inst *inst)
{
for (unsigned i = 0; i < inst->sources; i++) {
if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
if (reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE) !=
reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE))
return 0;
}
return reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
}
/*
* Return the closest legal execution type for an instruction on
* the specified platform.
*/
brw_reg_type
required_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
{
const brw_reg_type t = get_exec_type(inst);
const bool has_64bit = brw_reg_type_is_floating_point(t) ?
devinfo->has_64bit_float : devinfo->has_64bit_int;
switch (inst->opcode) {
case SHADER_OPCODE_SHUFFLE:
/* IVB has an issue (which we found empirically) where it reads
* two address register components per channel for indirectly
* addressed 64-bit sources.
*
* From the Cherryview PRM Vol 7. "Register Region Restrictions":
*
* "When source or destination datatype is 64b or operation is
* integer DWord multiply, indirect addressing must not be
* used."
*
* Work around both of the above and handle platforms that
* don't support 64-bit types at all.
*/
if ((!devinfo->has_64bit_int ||
devinfo->platform == INTEL_PLATFORM_CHV ||
intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
return BRW_REGISTER_TYPE_UD;
else if (has_dst_aligned_region_restriction(devinfo, inst))
return brw_int_type(type_sz(t), false);
else
return t;
case SHADER_OPCODE_SEL_EXEC:
if ((!has_64bit || devinfo->has_64bit_float_via_math_pipe) &&
type_sz(t) > 4)
return BRW_REGISTER_TYPE_UD;
else
return t;
case SHADER_OPCODE_QUAD_SWIZZLE:
if (has_dst_aligned_region_restriction(devinfo, inst))
return brw_int_type(type_sz(t), false);
else
return t;
case SHADER_OPCODE_CLUSTER_BROADCAST:
/* From the Cherryview PRM Vol 7. "Register Region Restrictions":
*
* "When source or destination datatype is 64b or operation is
* integer DWord multiply, indirect addressing must not be
* used."
*
* For MTL (verx10 == 125), float64 is supported, but int64 is not.
* Therefore we need to lower cluster broadcast using 32-bit int ops.
*
* For gfx12.5+ platforms that support int64, the register regions
* used by cluster broadcast aren't supported by the 64-bit pipeline.
*
* Work around the above and handle platforms that don't
* support 64-bit types at all.
*/
if ((!has_64bit || devinfo->verx10 >= 125 ||
devinfo->platform == INTEL_PLATFORM_CHV ||
intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
return BRW_REGISTER_TYPE_UD;
else
return brw_int_type(type_sz(t), false);
case SHADER_OPCODE_BROADCAST:
case SHADER_OPCODE_MOV_INDIRECT:
if (((devinfo->verx10 == 70 ||
devinfo->platform == INTEL_PLATFORM_CHV ||
intel_device_info_is_9lp(devinfo) ||
devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
(devinfo->verx10 >= 125 &&
brw_reg_type_is_floating_point(inst->src[0].type)))
return brw_int_type(type_sz(t), false);
else
return t;
default:
return t;
}
}
/*
* Return the stride between channels of the specified register in
* byte units, or ~0u if the region cannot be represented with a
* single one-dimensional stride.
*/
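/* E.g. a FIXED_GRF region like <8;8,1>:F advances 4 bytes per channel,
 * while <4;4,0>:F (zero horizontal stride with width > 1) has no single
 * equivalent stride and yields ~0u.
 */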
unsigned
byte_stride(const fs_reg &reg)
{
switch (reg.file) {
case BAD_FILE:
case UNIFORM:
case IMM:
case VGRF:
case MRF:
case ATTR:
return reg.stride * type_sz(reg.type);
case ARF:
case FIXED_GRF:
if (reg.is_null()) {
return 0;
} else {
const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
const unsigned width = 1 << reg.width;
if (width == 1) {
return vstride * type_sz(reg.type);
} else if (hstride * width == vstride) {
return hstride * type_sz(reg.type);
} else {
return ~0u;
}
}
default:
unreachable("Invalid register file");
}
}
/*
* Return whether the instruction has an unsupported channel bit layout
* specified for the i-th source region.
*/
bool
has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
unsigned i)
{
if (is_send(inst) || inst->is_math() || inst->is_control_source(i) ||
inst->opcode == BRW_OPCODE_DPAS) {
return false;
}
/* Empirical testing shows that Broadwell has a bug affecting half-float
* MAD instructions when any of its sources has a non-zero offset, such
* as:
*
* mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
*
* We used to generate code like this for SIMD8 executions where we
* used to pack components Y and W of a vector at offset 16B of a SIMD
* register. The problem doesn't occur if the stride of the source is 0.
*/
if (devinfo->ver == 8 &&
inst->opcode == BRW_OPCODE_MAD &&
inst->src[i].type == BRW_REGISTER_TYPE_HF &&
reg_offset(inst->src[i]) % REG_SIZE > 0 &&
inst->src[i].stride != 0) {
return true;
}
const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
return has_dst_aligned_region_restriction(devinfo, inst) &&
!is_uniform(inst->src[i]) &&
(byte_stride(inst->src[i]) != byte_stride(inst->dst) ||
src_byte_offset != dst_byte_offset);
}
/*
* Return whether the instruction has an unsupported channel bit layout
* specified for the destination region.
*/
bool
has_invalid_dst_region(const intel_device_info *devinfo,
const fs_inst *inst)
{
if (is_send(inst) || inst->is_math()) {
return false;
} else {
const brw_reg_type exec_type = get_exec_type(inst);
const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
type_sz(inst->dst.type) < type_sz(exec_type);
return (has_dst_aligned_region_restriction(devinfo, inst) &&
(required_dst_byte_stride(inst) != byte_stride(inst->dst) ||
required_dst_byte_offset(devinfo, inst) != dst_byte_offset)) ||
(is_narrowing_conversion &&
required_dst_byte_stride(inst) != byte_stride(inst->dst));
}
}
/**
* Return a non-zero value if the execution type of the instruction is
* unsupported. The destination and sources matching the returned mask
* will be bit-cast to an integer type of appropriate size, lowering any
* source or destination modifiers into separate MOV instructions.
*/
unsigned
has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
{
if (required_exec_type(devinfo, inst) != get_exec_type(inst)) {
switch (inst->opcode) {
case SHADER_OPCODE_SHUFFLE:
case SHADER_OPCODE_QUAD_SWIZZLE:
case SHADER_OPCODE_CLUSTER_BROADCAST:
case SHADER_OPCODE_BROADCAST:
case SHADER_OPCODE_MOV_INDIRECT:
return 0x1;
case SHADER_OPCODE_SEL_EXEC:
return 0x3;
default:
unreachable("Unknown invalid execution type source mask.");
}
} else {
return 0;
}
}
/*
* Return whether the instruction has unsupported source modifiers
* specified for the i-th source region.
*/
bool
has_invalid_src_modifiers(const intel_device_info *devinfo,
const fs_inst *inst, unsigned i)
{
return (!inst->can_do_source_mods(devinfo) &&
(inst->src[i].negate || inst->src[i].abs)) ||
((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
(inst->src[i].negate || inst->src[i].abs ||
inst->src[i].type != get_exec_type(inst)));
}
/*
* Return whether the instruction has an unsupported type conversion
* specified for the destination.
*/
bool
has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
{
switch (inst->opcode) {
case BRW_OPCODE_MOV:
return false;
case BRW_OPCODE_SEL:
return inst->dst.type != get_exec_type(inst);
default:
/* FIXME: We assume the opcodes not explicitly mentioned before just
* work fine with arbitrary conversions, unless they need to be
* bit-cast.
*/
return has_invalid_exec_type(devinfo, inst) &&
inst->dst.type != get_exec_type(inst);
}
}
/**
* Return whether the instruction has unsupported destination modifiers.
*/
bool
has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
{
return (has_invalid_exec_type(devinfo, inst) &&
(inst->saturate || inst->conditional_mod)) ||
has_invalid_conversion(devinfo, inst);
}
/**
* Return whether the instruction has non-standard semantics for the
* conditional mod which don't cause the flag register to be updated with
* the comparison result.
*/
bool
has_inconsistent_cmod(const fs_inst *inst)
{
return inst->opcode == BRW_OPCODE_SEL ||
inst->opcode == BRW_OPCODE_CSEL ||
inst->opcode == BRW_OPCODE_IF ||
inst->opcode == BRW_OPCODE_WHILE;
}
bool
lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}
namespace brw {
/**
* Remove any modifiers from the \p i-th source region of the instruction,
* including negate, abs and any implicit type conversion to the execution
* type. Instead any source modifiers will be implemented as a separate
* MOV instruction prior to the original instruction.
*/
bool
lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
{
assert(inst->components_read(i) == 1);
assert(v->devinfo->has_integer_dword_mul ||
inst->opcode != BRW_OPCODE_MUL ||
brw_reg_type_is_floating_point(get_exec_type(inst)) ||
MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
type_sz(inst->src[i].type) == get_exec_type_size(inst));
const fs_builder ibld(v, block, inst);
const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
inst->src[i] = tmp;
return true;
}
}
namespace {
/**
* Remove any modifiers from the destination region of the instruction,
* including saturate, conditional mod and any implicit type conversion
* from the execution type. Instead any destination modifiers will be
* implemented as a separate MOV instruction after the original
* instruction.
*/
bool
lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
const fs_builder ibld(v, block, inst);
const brw_reg_type type = get_exec_type(inst);
/* Not strictly necessary, but if possible use a temporary with the same
* channel alignment as the current destination in order to avoid
* violating the restrictions enforced later on by lower_src_region()
* and lower_dst_region(), which would introduce additional copy
* instructions into the program unnecessarily.
*/
const unsigned stride =
type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
fs_reg tmp = ibld.vgrf(type, stride);
ibld.UNDEF(tmp);
tmp = horiz_stride(tmp, stride);
/* Emit a MOV taking care of all the destination modifiers. */
fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
mov->saturate = inst->saturate;
if (!has_inconsistent_cmod(inst))
mov->conditional_mod = inst->conditional_mod;
if (inst->opcode != BRW_OPCODE_SEL) {
mov->predicate = inst->predicate;
mov->predicate_inverse = inst->predicate_inverse;
}
mov->flag_subreg = inst->flag_subreg;
lower_instruction(v, block, mov);
/* Point the original instruction at the temporary, and clean up any
* destination modifiers.
*/
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
inst->dst = tmp;
inst->size_written = inst->dst.component_size(inst->exec_size);
inst->saturate = false;
if (!has_inconsistent_cmod(inst))
inst->conditional_mod = BRW_CONDITIONAL_NONE;
assert(!inst->flags_written(v->devinfo) || !mov->predicate);
return true;
}
/**
* Remove any non-trivial shuffling of data from the \p i-th source region
* of the instruction. Instead implement the region as a series of integer
* copies into a temporary with the same channel layout as the destination.
*/
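/* E.g. if a W-typed source with stride 1 feeds a D-typed destination with
 * stride 1, the source is first copied into a W temporary with stride 2 so
 * that both regions advance 4 bytes per channel.
 */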
bool
lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
{
assert(inst->components_read(i) == 1);
const fs_builder ibld(v, block, inst);
const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
type_sz(inst->src[i].type);
assert(stride > 0);
fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
ibld.UNDEF(tmp);
tmp = horiz_stride(tmp, stride);
/* Emit a series of 32-bit integer copies with any source modifiers
* cleaned up (because their semantics are dependent on the type).
*/
const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
false);
const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
fs_reg raw_src = inst->src[i];
raw_src.negate = false;
raw_src.abs = false;
for (unsigned j = 0; j < n; j++)
ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
/* Point the original instruction at the temporary, making sure to keep
* any source modifiers in the instruction.
*/
fs_reg lower_src = tmp;
lower_src.negate = inst->src[i].negate;
lower_src.abs = inst->src[i].abs;
inst->src[i] = lower_src;
return true;
}
/**
* Remove any non-trivial shuffling of data from the destination region of
* the instruction. Instead implement the region as a series of integer
* copies from a temporary with a channel layout compatible with the
* sources.
*/
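/* E.g. a conversion writing a packed B destination is redirected to a
 * stride-4 B temporary, and a raw byte-wise copy then writes the result
 * back into the original packed region.
 */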
bool
lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
/* We cannot replace the result of an integer multiply which writes the
* accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
* value whereas the MOV will act on only 32 or 33 bits of the
* accumulator.
*/
assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
brw_reg_type_is_floating_point(inst->dst.type));
const fs_builder ibld(v, block, inst);
const unsigned stride = required_dst_byte_stride(inst) /
type_sz(inst->dst.type);
assert(stride > 0);
fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
ibld.UNDEF(tmp);
tmp = horiz_stride(tmp, stride);
/* Emit a series of 32-bit integer copies from the temporary into the
* original destination.
*/
const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
false);
const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
/* Note that in general we cannot simply predicate the copies on the
* same flag register as the original instruction, since it may have
* been overwritten by the instruction itself. Instead initialize
* the temporary with the previous contents of the destination
* register.
*/
for (unsigned j = 0; j < n; j++)
ibld.MOV(subscript(tmp, raw_type, j),
subscript(inst->dst, raw_type, j));
}
for (unsigned j = 0; j < n; j++)
ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
subscript(tmp, raw_type, j));
/* Point the original instruction at the temporary, making sure to keep
* any destination modifiers in the instruction.
*/
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
inst->dst = tmp;
inst->size_written = inst->dst.component_size(inst->exec_size);
return true;
}
/**
* Change sources and destination of the instruction to an
* appropriate legal type, splitting the instruction into multiple
* ones of smaller execution type if necessary, to be used in cases
* where the execution type of an instruction is unsupported.
*/
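/* E.g. a 64-bit SEL_EXEC on a platform without 64-bit integer support is
 * split into two UD instructions, one operating on the low dwords and one
 * on the high dwords of each channel.
 */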
bool
lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
assert(inst->dst.type == get_exec_type(inst));
const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
const brw_reg_type raw_type = required_exec_type(v->devinfo, inst);
const unsigned n = get_exec_type_size(inst) / type_sz(raw_type);
const fs_builder ibld(v, block, inst);
fs_reg tmp = ibld.vgrf(inst->dst.type, inst->dst.stride);
ibld.UNDEF(tmp);
tmp = horiz_stride(tmp, inst->dst.stride);
for (unsigned j = 0; j < n; j++) {
fs_inst sub_inst = *inst;
for (unsigned i = 0; i < inst->sources; i++) {
if (mask & (1u << i)) {
assert(inst->src[i].type == inst->dst.type);
sub_inst.src[i] = subscript(inst->src[i], raw_type, j);
}
}
sub_inst.dst = subscript(tmp, raw_type, j);
assert(sub_inst.size_written == sub_inst.dst.component_size(sub_inst.exec_size));
assert(!sub_inst.flags_written(v->devinfo) && !sub_inst.saturate);
ibld.emit(sub_inst);
fs_inst *mov = ibld.MOV(subscript(inst->dst, raw_type, j),
subscript(tmp, raw_type, j));
if (inst->opcode != BRW_OPCODE_SEL) {
mov->predicate = inst->predicate;
mov->predicate_inverse = inst->predicate_inverse;
}
lower_instruction(v, block, mov);
}
inst->remove(block);
return true;
}
/**
* Legalize the source and destination regioning controls of the specified
* instruction.
*/
bool
lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
{
const intel_device_info *devinfo = v->devinfo;
bool progress = false;
if (has_invalid_dst_modifiers(devinfo, inst))
progress |= lower_dst_modifiers(v, block, inst);
if (has_invalid_dst_region(devinfo, inst))
progress |= lower_dst_region(v, block, inst);
for (unsigned i = 0; i < inst->sources; i++) {
if (has_invalid_src_modifiers(devinfo, inst, i))
progress |= lower_src_modifiers(v, block, inst, i);
if (has_invalid_src_region(devinfo, inst, i))
progress |= lower_src_region(v, block, inst, i);
}
if (has_invalid_exec_type(devinfo, inst))
progress |= lower_exec_type(v, block, inst);
return progress;
}
}
bool
fs_visitor::lower_regioning()
{
bool progress = false;
foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
progress |= lower_instruction(this, block, inst);
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
return progress;
}

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,349 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/** @file brw_fs_register_coalesce.cpp
*
* Implements register coalescing: Checks if the two registers involved in a
* raw move don't interfere, in which case they can both be stored in the same
* place and the MOV removed.
*
* To do this, all uses of the source of the MOV in the shader are replaced
* with the destination of the MOV. For example:
*
* add vgrf3:F, vgrf1:F, vgrf2:F
* mov vgrf4:F, vgrf3:F
* mul vgrf5:F, vgrf5:F, vgrf4:F
*
* becomes
*
* add vgrf4:F, vgrf1:F, vgrf2:F
* mul vgrf5:F, vgrf5:F, vgrf4:F
*/
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_live_variables.h"
using namespace brw;
static bool
is_nop_mov(const fs_inst *inst)
{
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
fs_reg dst = inst->dst;
for (int i = 0; i < inst->sources; i++) {
if (!dst.equals(inst->src[i])) {
return false;
}
dst.offset += (i < inst->header_size ? REG_SIZE :
inst->exec_size * dst.stride *
type_sz(inst->src[i].type));
}
return true;
} else if (inst->opcode == BRW_OPCODE_MOV) {
return inst->dst.equals(inst->src[0]);
}
return false;
}
static bool
is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
{
if ((inst->opcode != BRW_OPCODE_MOV &&
inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
inst->is_partial_write() ||
inst->saturate ||
inst->src[0].file != VGRF ||
inst->src[0].negate ||
inst->src[0].abs ||
!inst->src[0].is_contiguous() ||
inst->dst.file != VGRF ||
inst->dst.type != inst->src[0].type) {
return false;
}
if (v->alloc.sizes[inst->src[0].nr] >
v->alloc.sizes[inst->dst.nr])
return false;
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
if (!is_coalescing_payload(v->alloc, inst)) {
return false;
}
}
return true;
}
static bool
can_coalesce_vars(const fs_live_variables &live, const cfg_t *cfg,
const bblock_t *block, const fs_inst *inst,
int dst_var, int src_var)
{
if (!live.vars_interfere(src_var, dst_var))
return true;
int dst_start = live.start[dst_var];
int dst_end = live.end[dst_var];
int src_start = live.start[src_var];
int src_end = live.end[src_var];
/* Variables interfere and one live range isn't a subset of the other. */
if ((dst_end > src_end && src_start < dst_start) ||
(src_end > dst_end && dst_start < src_start))
return false;
/* Check for a write to either register in the intersection of their live
* ranges.
*/
int start_ip = MAX2(dst_start, src_start);
int end_ip = MIN2(dst_end, src_end);
foreach_block(scan_block, cfg) {
if (scan_block->end_ip < start_ip)
continue;
int scan_ip = scan_block->start_ip - 1;
bool seen_src_write = false;
bool seen_copy = false;
foreach_inst_in_block(fs_inst, scan_inst, scan_block) {
scan_ip++;
/* Ignore anything before the intersection of the live ranges */
if (scan_ip < start_ip)
continue;
/* Ignore the copying instruction itself */
if (scan_inst == inst) {
seen_copy = true;
continue;
}
if (scan_ip > end_ip)
return true; /* registers do not interfere */
if (seen_src_write && !seen_copy) {
/* In order to satisfy the guarantee of register coalescing, we
* must ensure that the two registers always have the same value
* during the intersection of their live ranges. One way to do
* this is to simply ensure that neither is ever written apart
* from the one copy which syncs up the two registers. However,
* this can be overly conservative and only works in the case
* where the destination live range is entirely contained in the
* source live range.
*
* To handle the other case where the source is contained in the
* destination, we allow writes to the source register as long as
* they happen before the copy, in the same block as the copy, and
* the destination is never read between the first such write and the
* copy. This effectively moves the write from the copy up.
*/
for (int j = 0; j < scan_inst->sources; j++) {
if (regions_overlap(scan_inst->src[j], scan_inst->size_read(j),
inst->dst, inst->size_written))
return false; /* registers interfere */
}
}
/* The MOV being coalesced had better be the only instruction which
* writes to the coalesce destination in the intersection.
*/
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->dst, inst->size_written))
return false; /* registers interfere */
/* See the big comment above */
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
if (seen_copy || scan_block != block ||
(scan_inst->force_writemask_all && !inst->force_writemask_all))
return false;
seen_src_write = true;
}
}
}
return true;
}
bool
fs_visitor::register_coalesce()
{
bool progress = false;
fs_live_variables &live = live_analysis.require();
int src_size = 0;
int channels_remaining = 0;
unsigned src_reg = ~0u, dst_reg = ~0u;
int *dst_reg_offset = new int[MAX_VGRF_SIZE(devinfo)];
fs_inst **mov = new fs_inst *[MAX_VGRF_SIZE(devinfo)];
int *dst_var = new int[MAX_VGRF_SIZE(devinfo)];
int *src_var = new int[MAX_VGRF_SIZE(devinfo)];
foreach_block_and_inst(block, fs_inst, inst, cfg) {
if (!is_coalesce_candidate(this, inst))
continue;
if (is_nop_mov(inst)) {
inst->opcode = BRW_OPCODE_NOP;
progress = true;
continue;
}
if (src_reg != inst->src[0].nr) {
src_reg = inst->src[0].nr;
src_size = alloc.sizes[inst->src[0].nr];
assert(src_size <= MAX_VGRF_SIZE(devinfo));
channels_remaining = src_size;
memset(mov, 0, sizeof(*mov) * MAX_VGRF_SIZE(devinfo));
dst_reg = inst->dst.nr;
}
if (dst_reg != inst->dst.nr)
continue;
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
for (int i = 0; i < src_size; i++) {
dst_reg_offset[i] = i;
}
mov[0] = inst;
channels_remaining -= regs_written(inst);
} else {
const int offset = inst->src[0].offset / REG_SIZE;
if (mov[offset]) {
/* This is the second time that this offset in the register has
* been set. This means, in particular, that inst->dst was
* live before this instruction and that the live ranges of
* inst->dst and inst->src[0] overlap and we can't coalesce the
* two variables. Let's ensure that doesn't happen.
*/
channels_remaining = -1;
continue;
}
for (unsigned i = 0; i < MAX2(inst->size_written / REG_SIZE, 1); i++)
dst_reg_offset[offset + i] = inst->dst.offset / REG_SIZE + i;
mov[offset] = inst;
channels_remaining -= regs_written(inst);
}
if (channels_remaining)
continue;
bool can_coalesce = true;
for (int i = 0; i < src_size; i++) {
if (dst_reg_offset[i] != dst_reg_offset[0] + i) {
/* Registers are out-of-order. */
can_coalesce = false;
src_reg = ~0u;
break;
}
dst_var[i] = live.var_from_vgrf[dst_reg] + dst_reg_offset[i];
src_var[i] = live.var_from_vgrf[src_reg] + i;
if (!can_coalesce_vars(live, cfg, block, inst, dst_var[i], src_var[i])) {
can_coalesce = false;
src_reg = ~0u;
break;
}
}
if (!can_coalesce)
continue;
progress = true;
for (int i = 0; i < src_size; i++) {
if (!mov[i])
continue;
if (mov[i]->conditional_mod == BRW_CONDITIONAL_NONE) {
mov[i]->opcode = BRW_OPCODE_NOP;
mov[i]->dst = reg_undef;
for (int j = 0; j < mov[i]->sources; j++) {
mov[i]->src[j] = reg_undef;
}
} else {
/* If we have a conditional modifier, rewrite the MOV to be a
* MOV.cmod from the coalesced register. Hopefully, cmod
* propagation will clean this up and move it to the instruction
* that writes the register. If not, this keeps things correct
* while still letting us coalesce.
*/
assert(mov[i]->opcode == BRW_OPCODE_MOV);
assert(mov[i]->sources == 1);
mov[i]->src[0] = mov[i]->dst;
mov[i]->dst = retype(brw_null_reg(), mov[i]->dst.type);
}
}
foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
if (scan_inst->dst.file == VGRF &&
scan_inst->dst.nr == src_reg) {
scan_inst->dst.nr = dst_reg;
scan_inst->dst.offset = scan_inst->dst.offset % REG_SIZE +
dst_reg_offset[scan_inst->dst.offset / REG_SIZE] * REG_SIZE;
}
for (int j = 0; j < scan_inst->sources; j++) {
if (scan_inst->src[j].file == VGRF &&
scan_inst->src[j].nr == src_reg) {
scan_inst->src[j].nr = dst_reg;
scan_inst->src[j].offset = scan_inst->src[j].offset % REG_SIZE +
dst_reg_offset[scan_inst->src[j].offset / REG_SIZE] * REG_SIZE;
}
}
}
for (int i = 0; i < src_size; i++) {
live.start[dst_var[i]] = MIN2(live.start[dst_var[i]],
live.start[src_var[i]]);
live.end[dst_var[i]] = MAX2(live.end[dst_var[i]],
live.end[src_var[i]]);
}
src_reg = ~0u;
}
if (progress) {
foreach_block_and_inst_safe (block, backend_instruction, inst, cfg) {
if (inst->opcode == BRW_OPCODE_NOP) {
inst->remove(block, true);
}
}
cfg->adjust_block_ips();
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
delete[] src_var;
delete[] dst_var;
delete[] mov;
delete[] dst_reg_offset;
return progress;
}


@@ -0,0 +1,165 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_cfg.h"
using namespace brw;
/** @file brw_fs_saturate_propagation.cpp
*
* Implements a pass that propagates the SAT modifier from a MOV.SAT into the
* instruction that produced the source of the MOV.SAT, thereby allowing the
* MOV's src and dst to be coalesced and the MOV removed.
*
* For instance,
*
* ADD tmp, src0, src1
* MOV.SAT dst, tmp
*
* would be transformed into
*
* ADD.SAT tmp, src0, src1
* MOV dst, tmp
*/
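/* The pass can also fold a negation on the MOV's source into the producing
 * instruction when it knows how to negate it, e.g.
 *
 *    MUL tmp, src0, src1
 *    MOV.SAT dst, -tmp
 *
 * becomes
 *
 *    MUL.SAT tmp, -src0, src1
 *    MOV dst, tmp
 */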
static bool
opt_saturate_propagation_local(const fs_live_variables &live, bblock_t *block)
{
bool progress = false;
int ip = block->end_ip + 1;
foreach_inst_in_block_reverse(fs_inst, inst, block) {
ip--;
if (inst->opcode != BRW_OPCODE_MOV ||
!inst->saturate ||
inst->dst.file != VGRF ||
inst->dst.type != inst->src[0].type ||
inst->src[0].file != VGRF ||
inst->src[0].abs)
continue;
int src_var = live.var_from_reg(inst->src[0]);
int src_end_ip = live.end[src_var];
bool interfered = false;
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (scan_inst->exec_size == inst->exec_size &&
regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
if (scan_inst->is_partial_write() ||
(scan_inst->dst.type != inst->dst.type &&
!scan_inst->can_change_types()))
break;
if (scan_inst->saturate) {
inst->saturate = false;
progress = true;
} else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) {
if (scan_inst->can_do_saturate()) {
if (scan_inst->dst.type != inst->dst.type) {
scan_inst->dst.type = inst->dst.type;
for (int i = 0; i < scan_inst->sources; i++) {
scan_inst->src[i].type = inst->dst.type;
}
}
if (inst->src[0].negate) {
if (scan_inst->opcode == BRW_OPCODE_MUL) {
scan_inst->src[0].negate = !scan_inst->src[0].negate;
inst->src[0].negate = false;
} else if (scan_inst->opcode == BRW_OPCODE_MAD) {
for (int i = 0; i < 2; i++) {
if (scan_inst->src[i].file == IMM) {
brw_negate_immediate(scan_inst->src[i].type,
&scan_inst->src[i].as_brw_reg());
} else {
scan_inst->src[i].negate = !scan_inst->src[i].negate;
}
}
inst->src[0].negate = false;
} else if (scan_inst->opcode == BRW_OPCODE_ADD) {
if (scan_inst->src[1].file == IMM) {
if (!brw_negate_immediate(scan_inst->src[1].type,
&scan_inst->src[1].as_brw_reg())) {
break;
}
} else {
scan_inst->src[1].negate = !scan_inst->src[1].negate;
}
scan_inst->src[0].negate = !scan_inst->src[0].negate;
inst->src[0].negate = false;
} else {
break;
}
}
scan_inst->saturate = true;
inst->saturate = false;
progress = true;
}
}
break;
}
for (int i = 0; i < scan_inst->sources; i++) {
if (scan_inst->src[i].file == VGRF &&
scan_inst->src[i].nr == inst->src[0].nr &&
regions_overlap(
scan_inst->src[i], scan_inst->size_read(i),
inst->src[0], inst->size_read(0))) {
if (scan_inst->opcode != BRW_OPCODE_MOV ||
!scan_inst->saturate ||
scan_inst->src[0].abs ||
scan_inst->src[0].negate ||
scan_inst->src[0].abs != inst->src[0].abs ||
scan_inst->src[0].negate != inst->src[0].negate) {
interfered = true;
break;
}
}
}
if (interfered)
break;
}
}
return progress;
}
bool
fs_visitor::opt_saturate_propagation()
{
const fs_live_variables &live = live_analysis.require();
bool progress = false;
foreach_block (block, cfg) {
progress = opt_saturate_propagation_local(live, block) || progress;
}
/* Live intervals are still valid. */
return progress;
}

File diff suppressed because it is too large


@@ -0,0 +1,229 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
/** @file brw_fs_sel_peephole.cpp
*
* This file contains the opt_peephole_sel() optimization pass that replaces
* MOV instructions to the same destination in the "then" and "else" bodies of
* an if statement with SEL instructions.
*/
/* Four MOVs seems to be pretty typical, so I picked the next power of two in
* the hopes that it would handle almost anything possible in a single
* pass.
*/
#define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */
using namespace brw;
/**
* Scans forwards from an IF counting consecutive MOV instructions in the
* "then" and "else" blocks of the if statement.
*
* A pointer to the bblock_t following the IF is passed as the <then_block>
* argument. The function stores pointers to the MOV instructions in the
* <then_mov> and <else_mov> arrays.
*
* \return the minimum number of MOVs found in the two branches or zero if
* an error occurred.
*
* E.g.:
* IF ...
* then_mov[0] = MOV g4, ...
* then_mov[1] = MOV g5, ...
* then_mov[2] = MOV g6, ...
* ELSE ...
* else_mov[0] = MOV g4, ...
* else_mov[1] = MOV g5, ...
* else_mov[2] = MOV g7, ...
* ENDIF
* returns 3.
*/
static int
count_movs_from_if(const intel_device_info *devinfo,
fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
bblock_t *then_block, bblock_t *else_block)
{
int then_movs = 0;
foreach_inst_in_block(fs_inst, inst, then_block) {
if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
inst->flags_written(devinfo))
break;
then_mov[then_movs] = inst;
then_movs++;
}
int else_movs = 0;
foreach_inst_in_block(fs_inst, inst, else_block) {
if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV ||
inst->flags_written(devinfo))
break;
else_mov[else_movs] = inst;
else_movs++;
}
return MIN2(then_movs, else_movs);
}
/**
* Try to replace IF/MOV+/ELSE/MOV+/ENDIF with SEL.
*
* Many GLSL shaders contain the following pattern:
*
* x = condition ? foo : bar
*
* or
*
* if (...) a.xyzw = foo.xyzw;
* else a.xyzw = bar.xyzw;
*
* The compiler emits an ir_if tree for this, since each subexpression might be
* a complex tree that could have side-effects or short-circuit logic.
*
* However, the common case is to simply select one of two constants or
* variable values---which is exactly what SEL is for. In this case, the
* assembly looks like:
*
* (+f0) IF
* MOV dst src0
* ...
* ELSE
* MOV dst src1
* ...
* ENDIF
*
* where each pair of MOVs to a common destination can be easily translated
* into
*
* (+f0) SEL dst src0 src1
*
* If src0 is an immediate value, we promote it to a temporary GRF.
*/
bool
fs_visitor::opt_peephole_sel()
{
bool progress = false;
foreach_block (block, cfg) {
/* IF instructions, by definition, can only be found at the ends of
* basic blocks.
*/
fs_inst *if_inst = (fs_inst *)block->end();
if (if_inst->opcode != BRW_OPCODE_IF)
continue;
fs_inst *else_mov[MAX_MOVS] = { NULL };
fs_inst *then_mov[MAX_MOVS] = { NULL };
bblock_t *then_block = block->next();
bblock_t *else_block = NULL;
foreach_list_typed(bblock_link, child, link, &block->children) {
if (child->block != then_block) {
if (child->block->prev()->end()->opcode == BRW_OPCODE_ELSE) {
else_block = child->block;
}
break;
}
}
if (else_block == NULL)
continue;
int movs = count_movs_from_if(devinfo, then_mov, else_mov, then_block, else_block);
if (movs == 0)
continue;
/* Generate SEL instructions for pairs of MOVs to a common destination. */
for (int i = 0; i < movs; i++) {
if (!then_mov[i] || !else_mov[i])
break;
/* Check that the MOVs are the right form. */
if (!then_mov[i]->dst.equals(else_mov[i]->dst) ||
then_mov[i]->exec_size != else_mov[i]->exec_size ||
then_mov[i]->group != else_mov[i]->group ||
then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
then_mov[i]->is_partial_write() ||
else_mov[i]->is_partial_write() ||
then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) {
movs = i;
break;
}
/* Check that source types for mov operations match. */
if (then_mov[i]->src[0].type != else_mov[i]->src[0].type) {
movs = i;
break;
}
}
if (movs == 0)
continue;
for (int i = 0; i < movs; i++) {
const fs_builder ibld = fs_builder(this, then_block, then_mov[i])
.at(block, if_inst);
if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) {
ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]);
} else {
/* Only the last source register can be a constant, so if the MOV
* in the "then" clause uses a constant, we need to put it in a
* temporary.
*/
fs_reg src0(then_mov[i]->src[0]);
if (src0.file == IMM) {
src0 = ibld.vgrf(then_mov[i]->src[0].type);
ibld.MOV(src0, then_mov[i]->src[0]);
}
/* 64-bit immediates can't be placed in src1. */
fs_reg src1(else_mov[i]->src[0]);
if (src1.file == IMM && type_sz(src1.type) == 8) {
src1 = ibld.vgrf(else_mov[i]->src[0].type);
ibld.MOV(src1, else_mov[i]->src[0]);
}
set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
ibld.SEL(then_mov[i]->dst, src0, src1));
}
then_mov[i]->remove(then_block);
else_mov[i]->remove(else_block);
}
progress = true;
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
return progress;
}


@@ -0,0 +1,605 @@
/*
* Copyright © 2006-2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
using namespace brw;
vs_thread_payload::vs_thread_payload(const fs_visitor &v)
{
unsigned r = 0;
/* R0: Thread header. */
r += reg_unit(v.devinfo);
/* R1: URB handles. */
urb_handles = brw_ud8_grf(r, 0);
r += reg_unit(v.devinfo);
num_regs = r;
}
tcs_thread_payload::tcs_thread_payload(const fs_visitor &v)
{
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(v.prog_data);
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) v.key;
if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH) {
patch_urb_output = brw_ud1_grf(0, 0);
primitive_id = brw_vec1_grf(0, 1);
/* r1-r4 contain the ICP handles. */
icp_handle_start = brw_ud8_grf(1, 0);
num_regs = 5;
} else {
assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
assert(tcs_key->input_vertices <= BRW_MAX_TCS_INPUT_VERTICES);
unsigned r = 0;
r += reg_unit(v.devinfo);
patch_urb_output = brw_ud8_grf(r, 0);
r += reg_unit(v.devinfo);
if (tcs_prog_data->include_primitive_id) {
primitive_id = brw_vec8_grf(r, 0);
r += reg_unit(v.devinfo);
}
/* ICP handles occupy the next 1-32 registers. */
icp_handle_start = brw_ud8_grf(r, 0);
r += brw_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);
num_regs = r;
}
}
tes_thread_payload::tes_thread_payload(const fs_visitor &v)
{
unsigned r = 0;
/* R0: Thread Header. */
patch_urb_input = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
primitive_id = brw_vec1_grf(0, 1);
r += reg_unit(v.devinfo);
/* R1-3: gl_TessCoord.xyz. */
for (unsigned i = 0; i < 3; i++) {
coords[i] = brw_vec8_grf(r, 0);
r += reg_unit(v.devinfo);
}
/* R4: URB output handles. */
urb_output = brw_ud8_grf(r, 0);
r += reg_unit(v.devinfo);
num_regs = r;
}
gs_thread_payload::gs_thread_payload(fs_visitor &v)
{
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data);
const fs_builder bld = fs_builder(&v).at_end();
/* R0: thread header. */
unsigned r = reg_unit(v.devinfo);
/* R1: output URB handles. */
urb_handles = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.AND(urb_handles, brw_ud8_grf(r, 0),
v.devinfo->ver >= 20 ? brw_imm_ud(0xFFFFFF) : brw_imm_ud(0xFFFF));
/* R1: Instance ID stored in bits 31:27 */
instance_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.SHR(instance_id, brw_ud8_grf(r, 0), brw_imm_ud(27u));
r += reg_unit(v.devinfo);
if (gs_prog_data->include_primitive_id) {
primitive_id = brw_ud8_grf(r, 0);
r += reg_unit(v.devinfo);
}
/* Always enable VUE handles so we can safely use pull model if needed.
*
* The push model for a GS uses a ton of register space even for trivial
* scenarios with just a few inputs, so just make things easier and a bit
* safer by always having pull model available.
*/
gs_prog_data->base.include_vue_handles = true;
/* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
icp_handle_start = brw_ud8_grf(r, 0);
r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);
num_regs = r;
/* Use a maximum of 24 registers for push-model inputs. */
const unsigned max_push_components = 24;
/* If pushing our inputs would take too many registers, reduce the URB read
* length (which is in HWords, or 8 registers), and resort to pulling.
*
* Note that the GS reads <URB Read Length> HWords for every vertex - so we
* have to multiply by VerticesIn to obtain the total storage requirement.
*/
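/* E.g. with 3 incoming vertices and a read length of 2 HWords,
 * 8 * 2 * 3 = 48 > 24, so the read length is reduced to
 * ROUND_DOWN_TO(24 / 3, 8) / 8 = 1 HWord per vertex.
 */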
if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
max_push_components) {
vue_prog_data->urb_read_length =
ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
}
}
static inline void
setup_fs_payload_gfx20(fs_thread_payload &payload,
const fs_visitor &v,
bool &source_depth_to_render_target)
{
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
const unsigned payload_width = 16;
assert(v.dispatch_width % payload_width == 0);
assert(v.devinfo->ver >= 20);
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
/* R0-1: PS thread payload header, masks and pixel X/Y coordinates. */
payload.num_regs++;
payload.subspan_coord_reg[j] = payload.num_regs++;
}
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
/* R2-13: Barycentric interpolation coordinates. These appear
* in the same order that they appear in the brw_barycentric_mode
* enum. Each set of coordinates occupies 2 64B registers per
* SIMD16 half. Coordinates only appear if they were enabled
* using the "Barycentric Interpolation Mode" bits in WM_STATE.
*/
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
if (prog_data->barycentric_interp_modes & (1 << i)) {
payload.barycentric_coord_reg[i][j] = payload.num_regs;
payload.num_regs += payload_width / 4;
}
}
/* R14: Interpolated depth if "Pixel Shader Uses Source Depth" is set. */
if (prog_data->uses_src_depth) {
payload.source_depth_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R15: Interpolated W if "Pixel Shader Uses Source W" is set. */
if (prog_data->uses_src_w) {
payload.source_w_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R16: MSAA input coverage mask if "Pixel Shader Uses Input
* Coverage Mask" is set.
*/
if (prog_data->uses_sample_mask) {
payload.sample_mask_in_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R19: MSAA position XY offsets if "Position XY Offset Select"
* is either POSOFFSET_CENTROID or POSOFFSET_SAMPLE. Note that
* this is delivered as a single SIMD32 vector, inconsistently
* with most other PS payload fields.
*/
if (prog_data->uses_pos_offset && j == 0) {
for (unsigned k = 0; k < 2; k++) {
payload.sample_pos_reg[k] = payload.num_regs;
payload.num_regs++;
}
}
}
if (prog_data->uses_depth_w_coefficients) {
assert(v.max_polygons == 1);
payload.depth_w_coef_reg = payload.num_regs;
payload.num_regs += 2;
}
if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
source_depth_to_render_target = true;
}
}
static inline void
setup_fs_payload_gfx6(fs_thread_payload &payload,
const fs_visitor &v,
bool &source_depth_to_render_target)
{
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
const unsigned payload_width = MIN2(16, v.dispatch_width);
assert(v.dispatch_width % payload_width == 0);
assert(v.devinfo->ver >= 6 && v.devinfo->ver < 20);
payload.num_regs = 0;
/* R0: PS thread payload header. */
payload.num_regs++;
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
/* R1: masks, pixel X/Y coordinates. */
payload.subspan_coord_reg[j] = payload.num_regs++;
}
for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
/* R3-26: barycentric interpolation coordinates. These appear in the
* same order that they appear in the brw_barycentric_mode enum. Each
* set of coordinates occupies 2 registers if dispatch width == 8 and 4
* registers if dispatch width == 16. Coordinates only appear if they
* were enabled using the "Barycentric Interpolation Mode" bits in
* WM_STATE.
*/
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
if (prog_data->barycentric_interp_modes & (1 << i)) {
payload.barycentric_coord_reg[i][j] = payload.num_regs;
payload.num_regs += payload_width / 4;
}
}
/* R27-28: interpolated depth if uses source depth */
if (prog_data->uses_src_depth) {
payload.source_depth_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
if (prog_data->uses_src_w) {
payload.source_w_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
/* R31: MSAA position offsets. */
if (prog_data->uses_pos_offset) {
payload.sample_pos_reg[j] = payload.num_regs;
payload.num_regs++;
}
/* R32-33: MSAA input coverage mask */
if (prog_data->uses_sample_mask) {
assert(v.devinfo->ver >= 7);
payload.sample_mask_in_reg[j] = payload.num_regs;
payload.num_regs += payload_width / 8;
}
}
/* R66: Source Depth and/or W Attribute Vertex Deltas */
if (prog_data->uses_depth_w_coefficients) {
assert(v.max_polygons == 1);
payload.depth_w_coef_reg = payload.num_regs;
payload.num_regs++;
}
if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
source_depth_to_render_target = true;
}
}
#undef P /* prompted depth */
#undef C /* computed */
#undef N /* non-promoted? */
#define P 0
#define C 1
#define N 2
static const struct {
GLuint mode:2;
GLuint sd_present:1;
GLuint sd_to_rt:1;
GLuint dd_present:1;
GLuint ds_present:1;
} wm_iz_table[BRW_WM_IZ_BIT_MAX] =
{
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 1 },
{ N, 0, 1, 0, 1 },
{ N, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ C, 0, 0, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 0, 1 },
{ C, 0, 1, 0, 1 },
{ C, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 }
};
/**
* \param line_aa BRW_NEVER, BRW_ALWAYS or BRW_SOMETIMES
* \param lookup bitmask of BRW_WM_IZ_* flags
*/
static inline void
setup_fs_payload_gfx4(fs_thread_payload &payload,
const fs_visitor &v,
bool &source_depth_to_render_target,
bool &runtime_check_aads_emit)
{
assert(v.dispatch_width <= 16);
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
brw_wm_prog_key *key = (brw_wm_prog_key *) v.key;
GLuint reg = 1;
bool kill_stats_promoted_workaround = false;
int lookup = key->iz_lookup;
assert(lookup < BRW_WM_IZ_BIT_MAX);
/* Crazy workaround in the windowizer, which we need to track in
* our register allocation and render target writes. See the "If
* statistics are enabled..." paragraph of 11.5.3.2: Early Depth
* Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
*/
if (key->stats_wm &&
(lookup & BRW_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
wm_iz_table[lookup].mode == P) {
kill_stats_promoted_workaround = true;
}
payload.subspan_coord_reg[0] = reg++;
if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
kill_stats_promoted_workaround) {
payload.source_depth_reg[0] = reg;
reg += 2;
}
if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
source_depth_to_render_target = true;
if (wm_iz_table[lookup].ds_present || key->line_aa != BRW_NEVER) {
payload.aa_dest_stencil_reg[0] = reg;
runtime_check_aads_emit =
!wm_iz_table[lookup].ds_present && key->line_aa == BRW_SOMETIMES;
reg++;
}
if (wm_iz_table[lookup].dd_present) {
payload.dest_depth_reg[0] = reg;
reg += 2;
}
payload.num_regs = reg;
}
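/* Worked example (illustrative): a lookup whose wm_iz_table row is
* { C, 0, 1, 1, 0 } selects computed depth, forwards source depth to the
* render target (sd_to_rt) and reserves a destination depth register
* (dd_present), but allocates no source depth or stencil payload registers
* unless prog_data->uses_src_depth asks for them.
*/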
#undef P /* promoted depth */
#undef C /* computed */
#undef N /* non-promoted? */
fs_thread_payload::fs_thread_payload(const fs_visitor &v,
bool &source_depth_to_render_target,
bool &runtime_check_aads_emit)
: subspan_coord_reg(),
source_depth_reg(),
source_w_reg(),
aa_dest_stencil_reg(),
dest_depth_reg(),
sample_pos_reg(),
sample_mask_in_reg(),
depth_w_coef_reg(),
barycentric_coord_reg()
{
if (v.devinfo->ver >= 20)
setup_fs_payload_gfx20(*this, v, source_depth_to_render_target);
else if (v.devinfo->ver >= 6)
setup_fs_payload_gfx6(*this, v, source_depth_to_render_target);
else
setup_fs_payload_gfx4(*this, v, source_depth_to_render_target,
runtime_check_aads_emit);
}
cs_thread_payload::cs_thread_payload(const fs_visitor &v)
{
struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
unsigned r = reg_unit(v.devinfo);
/* See nir_setup_uniforms for subgroup_id in earlier versions. */
if (v.devinfo->verx10 >= 125) {
subgroup_id_ = brw_ud1_grf(0, 2);
for (int i = 0; i < 3; i++) {
if (prog_data->generate_local_id & (1 << i)) {
local_invocation_id[i] = brw_uw8_grf(r, 0);
r += reg_unit(v.devinfo);
if (v.devinfo->ver < 20 && v.dispatch_width == 32)
r += reg_unit(v.devinfo);
} else {
local_invocation_id[i] = brw_imm_uw(0);
}
}
/* TODO: Fill out uses_btd_stack_ids automatically */
if (prog_data->uses_btd_stack_ids)
r += reg_unit(v.devinfo);
}
num_regs = r;
}
void
cs_thread_payload::load_subgroup_id(const fs_builder &bld,
fs_reg &dest) const
{
auto devinfo = bld.shader->devinfo;
dest = retype(dest, BRW_REGISTER_TYPE_UD);
if (subgroup_id_.file != BAD_FILE) {
assert(devinfo->verx10 >= 125);
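/* The subgroup index lives in the low byte of r0.2; INTEL_MASK(7, 0) masks off the rest. */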
bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0)));
} else {
assert(devinfo->verx10 < 125);
assert(gl_shader_stage_is_compute(bld.shader->stage));
int index = brw_get_subgroup_id_param_index(devinfo,
bld.shader->stage_prog_data);
bld.MOV(dest, fs_reg(UNIFORM, index, BRW_REGISTER_TYPE_UD));
}
}
task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
: cs_thread_payload(v)
{
/* Task and Mesh Shader Payloads (SIMD8 and SIMD16)
*
* R0: Header
* R1: Local_ID.X[0-7 or 0-15]
* R2: Inline Parameter
*
* Task and Mesh Shader Payloads (SIMD32)
*
* R0: Header
* R1: Local_ID.X[0-15]
* R2: Local_ID.X[16-31]
* R3: Inline Parameter
*
* Local_ID.X values are 16 bits.
*
* Inline parameter is optional but always present since we use it to pass
* the address to descriptors.
*/
const fs_builder bld = fs_builder(&v).at_end();
unsigned r = 0;
assert(subgroup_id_.file != BAD_FILE);
extended_parameter_0 = retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);
if (v.devinfo->ver >= 20) {
urb_output = brw_ud1_grf(1, 0);
} else {
urb_output = bld.vgrf(BRW_REGISTER_TYPE_UD);
/* In both mesh and task shader payloads, the lower 16 bits of g0.6 are
* an offset within the Slice's Local URB, which says where the shader is
* supposed to output its data.
*/
bld.AND(urb_output, brw_ud1_grf(0, 6), brw_imm_ud(0xFFFF));
}
if (v.stage == MESA_SHADER_MESH) {
/* g0.7 is Task Shader URB Entry Offset, which contains both an offset
* within the Slice's Local URB (bits 0:15) and a slice selector
* (bits 16:24). The slice selector can be non-zero when the mesh shader
* is spawned on a slice other than the one where the task shader ran.
* Bit 24 says that a Slice ID is present, and bits 16:23 are the Slice ID.
*/
task_urb_input = brw_ud1_grf(0, 7);
}
r += reg_unit(v.devinfo);
local_index = brw_uw8_grf(r, 0);
r += reg_unit(v.devinfo);
if (v.devinfo->ver < 20 && v.dispatch_width == 32)
r += reg_unit(v.devinfo);
inline_parameter = brw_ud1_grf(r, 0);
r += reg_unit(v.devinfo);
num_regs = r;
}
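/* Worked example (illustrative), pre-Xe2 where reg_unit() == 1: in SIMD16
* the header takes r0 and Local_ID.X takes r1, so the inline parameter
* lands in r2 and num_regs == 3. In SIMD32 the second Local_ID.X register
* pushes the inline parameter to r3 and num_regs == 4, matching the layout
* comment above.
*/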
bs_thread_payload::bs_thread_payload(const fs_visitor &v)
{
unsigned r = 0;
/* R0: Thread header. */
r += reg_unit(v.devinfo);
/* R1: Stack IDs. */
r += reg_unit(v.devinfo);
/* R2: Inline Parameter. Used for argument addresses. */
global_arg_ptr = brw_ud1_grf(r, 0);
local_arg_ptr = brw_ud1_grf(r, 2);
r += reg_unit(v.devinfo);
num_regs = r;
}
void
bs_thread_payload::load_shader_type(const fs_builder &bld, fs_reg &dest) const
{
fs_reg ud_dest = retype(dest, BRW_REGISTER_TYPE_UD);
bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type));
bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf));
}


@ -0,0 +1,199 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/** @file brw_fs_validate.cpp
*
* Implements a pass that validates various invariants of the IR. The current
* pass only validates that GRF uses are sane. More can be added later.
*/
#include "brw_fs.h"
#include "brw_cfg.h"
#define fsv_assert(assertion) \
{ \
if (!(assertion)) { \
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
_mesa_shader_stage_to_abbrev(stage)); \
dump_instruction(inst, stderr); \
fprintf(stderr, "%s:%d: '%s' failed\n", __FILE__, __LINE__, #assertion); \
abort(); \
} \
}
#define fsv_assert_eq(first, second) \
{ \
unsigned f = (first); \
unsigned s = (second); \
if (f != s) { \
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
_mesa_shader_stage_to_abbrev(stage)); \
dump_instruction(inst, stderr); \
fprintf(stderr, "%s:%d: A == B failed\n", __FILE__, __LINE__); \
fprintf(stderr, " A = %s = %u\n", #first, f); \
fprintf(stderr, " B = %s = %u\n", #second, s); \
abort(); \
} \
}
#define fsv_assert_ne(first, second) \
{ \
unsigned f = (first); \
unsigned s = (second); \
if (f == s) { \
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
_mesa_shader_stage_to_abbrev(stage)); \
dump_instruction(inst, stderr); \
fprintf(stderr, "%s:%d: A != B failed\n", __FILE__, __LINE__); \
fprintf(stderr, " A = %s = %u\n", #first, f); \
fprintf(stderr, " B = %s = %u\n", #second, s); \
abort(); \
} \
}
#define fsv_assert_lte(first, second) \
{ \
unsigned f = (first); \
unsigned s = (second); \
if (f > s) { \
fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", \
_mesa_shader_stage_to_abbrev(stage)); \
dump_instruction(inst, stderr); \
fprintf(stderr, "%s:%d: A <= B failed\n", __FILE__, __LINE__); \
fprintf(stderr, " A = %s = %u\n", #first, f); \
fprintf(stderr, " B = %s = %u\n", #second, s); \
abort(); \
} \
}
#ifndef NDEBUG
void
fs_visitor::validate()
{
cfg->validate(_mesa_shader_stage_to_abbrev(stage));
foreach_block_and_inst (block, fs_inst, inst, cfg) {
switch (inst->opcode) {
case SHADER_OPCODE_SEND:
fsv_assert(is_uniform(inst->src[0]) && is_uniform(inst->src[1]));
break;
case BRW_OPCODE_MOV:
fsv_assert(inst->sources == 1);
break;
default:
break;
}
if (inst->is_3src(compiler)) {
const unsigned integer_sources =
brw_reg_type_is_integer(inst->src[0].type) +
brw_reg_type_is_integer(inst->src[1].type) +
brw_reg_type_is_integer(inst->src[2].type);
const unsigned float_sources =
brw_reg_type_is_floating_point(inst->src[0].type) +
brw_reg_type_is_floating_point(inst->src[1].type) +
brw_reg_type_is_floating_point(inst->src[2].type);
fsv_assert((integer_sources == 3 && float_sources == 0) ||
(integer_sources == 0 && float_sources == 3));
if (devinfo->ver >= 10) {
for (unsigned i = 0; i < 3; i++) {
if (inst->src[i].file == BRW_IMMEDIATE_VALUE)
continue;
switch (inst->src[i].vstride) {
case BRW_VERTICAL_STRIDE_0:
case BRW_VERTICAL_STRIDE_4:
case BRW_VERTICAL_STRIDE_8:
case BRW_VERTICAL_STRIDE_16:
break;
case BRW_VERTICAL_STRIDE_1:
fsv_assert_lte(12, devinfo->ver);
break;
case BRW_VERTICAL_STRIDE_2:
fsv_assert_lte(devinfo->ver, 11);
break;
default:
fsv_assert(!"invalid vstride");
break;
}
}
} else if (grf_used != 0) {
/* Only perform the pre-Gfx10 checks after register allocation has
* occurred.
*
* Many passes (e.g., constant copy propagation) will generate
* invalid 3-source instructions with the expectation that later
* passes (e.g., combine constants) will fix them.
*/
for (unsigned i = 0; i < 3; i++) {
fsv_assert_ne(inst->src[i].file, BRW_IMMEDIATE_VALUE);
/* A stride of 1 (the usual case) or 0, with a special
* "repctrl" bit, is allowed. The repctrl bit doesn't work for
* 64-bit datatypes, so if the source type is 64-bit then only
* a stride of 1 is allowed. From the Broadwell PRM, Volume 7
* "3D Media GPGPU", page 944:
*
* This is applicable to 32b datatypes and 16b datatype. 64b
* datatypes cannot use the replicate control.
*/
fsv_assert_lte(inst->src[i].vstride, 1);
if (type_sz(inst->src[i].type) > 4)
fsv_assert_eq(inst->src[i].vstride, 1);
}
}
}
if (inst->dst.file == VGRF) {
fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst),
alloc.sizes[inst->dst.nr]);
}
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
fsv_assert_lte(inst->src[i].offset / REG_SIZE + regs_read(inst, i),
alloc.sizes[inst->src[i].nr]);
}
}
/* Accumulator Registers, bspec 47251:
*
* "When destination is accumulator with offset 0, destination
* horizontal stride must be 1."
*/
if (intel_needs_workaround(devinfo, 14014617373) &&
inst->dst.is_accumulator() &&
inst->dst.offset == 0) {
fsv_assert_eq(inst->dst.stride, 1);
}
}
}
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,108 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_compiler.h"
#include "compiler/nir/nir.h"
static char const *get_qual_name(int mode)
{
switch (mode) {
case INTERP_MODE_NONE: return "none";
case INTERP_MODE_FLAT: return "flat";
case INTERP_MODE_SMOOTH: return "smooth";
case INTERP_MODE_NOPERSPECTIVE: return "nopersp";
default: return "???";
}
}
static void
gfx4_frag_prog_set_interp_modes(struct brw_wm_prog_data *prog_data,
const struct intel_vue_map *vue_map,
unsigned location, unsigned slot_count,
enum glsl_interp_mode interp)
{
for (unsigned k = 0; k < slot_count; k++) {
int slot = vue_map->varying_to_slot[location + k];
if (slot != -1 && prog_data->interp_mode[slot] == INTERP_MODE_NONE) {
prog_data->interp_mode[slot] = interp;
if (prog_data->interp_mode[slot] == INTERP_MODE_FLAT) {
prog_data->contains_flat_varying = true;
} else if (prog_data->interp_mode[slot] == INTERP_MODE_NOPERSPECTIVE) {
prog_data->contains_noperspective_varying = true;
}
}
}
}
/* Set up interpolation modes for every element in the VUE */
void
brw_setup_vue_interpolation(const struct intel_vue_map *vue_map, nir_shader *nir,
struct brw_wm_prog_data *prog_data)
{
/* Initialise interp_mode. INTERP_MODE_NONE == 0 */
memset(prog_data->interp_mode, 0, sizeof(prog_data->interp_mode));
if (!vue_map)
return;
/* HPOS always wants noperspective. Setting it up here allows
* us to avoid special handling in the SF program.
*/
int pos_slot = vue_map->varying_to_slot[VARYING_SLOT_POS];
if (pos_slot != -1) {
prog_data->interp_mode[pos_slot] = INTERP_MODE_NOPERSPECTIVE;
prog_data->contains_noperspective_varying = true;
}
nir_foreach_shader_in_variable(var, nir) {
unsigned location = var->data.location;
unsigned slot_count = glsl_count_attribute_slots(var->type, false);
gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location, slot_count,
var->data.interpolation);
if (location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1) {
location = location + VARYING_SLOT_BFC0 - VARYING_SLOT_COL0;
gfx4_frag_prog_set_interp_modes(prog_data, vue_map, location,
slot_count, var->data.interpolation);
}
}
const bool debug = false;
if (debug) {
fprintf(stderr, "VUE map:\n");
for (int i = 0; i < vue_map->num_slots; i++) {
int varying = vue_map->slot_to_varying[i];
if (varying == -1) {
fprintf(stderr, "%d: --\n", i);
continue;
}
fprintf(stderr, "%d: %d %s ofs %d\n",
i, varying,
get_qual_name(prog_data->interp_mode[i]),
brw_vue_slot_to_offset(i));
}
}
}


@ -0,0 +1,216 @@
/* -*- c++ -*- */
/*
* Copyright © 2010-2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_H
#define BRW_IR_H
#include <assert.h>
#include "brw_reg.h"
#include "compiler/glsl/list.h"
#define MAX_SAMPLER_MESSAGE_SIZE 11
/* The sampler can return a vec5 when sampling with sparse residency. In
* SIMD32, each component takes up 4 GRFs, so we need to allow up to size-20
* VGRFs to hold the result.
*/
#define MAX_VGRF_SIZE(devinfo) ((devinfo)->ver >= 20 ? 40 : 20)
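/* Sanity sketch (illustrative) for the pre-Xe2 value: one SIMD32 component
* is 32 lanes * 4 bytes = 4 32-byte GRFs, so a vec5 result needs 5 * 4
* registers.
*/
static_assert(5 * (32 * 4 / 32) == 20, "SIMD32 vec5 result fits in 20 GRFs");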
#ifdef __cplusplus
struct backend_reg : private brw_reg
{
backend_reg() {}
backend_reg(const struct brw_reg &reg) : brw_reg(reg), offset(0) {}
const brw_reg &as_brw_reg() const
{
assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
assert(offset == 0);
return static_cast<const brw_reg &>(*this);
}
brw_reg &as_brw_reg()
{
assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
assert(offset == 0);
return static_cast<brw_reg &>(*this);
}
bool equals(const backend_reg &r) const;
bool negative_equals(const backend_reg &r) const;
bool is_zero() const;
bool is_one() const;
bool is_negative_one() const;
bool is_null() const;
bool is_accumulator() const;
/** Offset from the start of the (virtual) register in bytes. */
uint16_t offset;
using brw_reg::type;
using brw_reg::file;
using brw_reg::negate;
using brw_reg::abs;
using brw_reg::address_mode;
using brw_reg::subnr;
using brw_reg::nr;
using brw_reg::swizzle;
using brw_reg::writemask;
using brw_reg::indirect_offset;
using brw_reg::vstride;
using brw_reg::width;
using brw_reg::hstride;
using brw_reg::df;
using brw_reg::f;
using brw_reg::d;
using brw_reg::ud;
using brw_reg::d64;
using brw_reg::u64;
};
struct bblock_t;
struct backend_instruction : public exec_node {
bool is_3src(const struct brw_compiler *compiler) const;
bool is_math() const;
bool is_control_flow_begin() const;
bool is_control_flow_end() const;
bool is_control_flow() const;
bool is_commutative() const;
bool can_do_source_mods() const;
bool can_do_saturate() const;
bool can_do_cmod() const;
bool reads_accumulator_implicitly() const;
bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const;
/**
* Instructions that use indirect addressing have additional register
* regioning restrictions.
*/
bool uses_indirect_addressing() const;
void remove(bblock_t *block, bool defer_later_block_ip_updates = false);
void insert_after(bblock_t *block, backend_instruction *inst);
void insert_before(bblock_t *block, backend_instruction *inst);
/**
* True if the instruction has side effects other than writing to
* its destination registers. You are expected not to reorder or
* optimize these out unless you know what you are doing.
*/
bool has_side_effects() const;
/**
* True if the instruction might be affected by side effects of other
* instructions.
*/
bool is_volatile() const;
#else
struct backend_instruction {
struct exec_node link;
#endif
/** @{
* Annotation for the generated IR. One of the two can be set.
*/
const void *ir;
const char *annotation;
/** @} */
/**
* Execution size of the instruction. This is used by the generator to
* generate the correct binary for the given instruction. Current valid
* values are 1, 4, 8, 16, 32.
*/
uint8_t exec_size;
/**
* Channel group from the hardware execution and predication mask that
* should be applied to the instruction. The subset of channel enable
* signals (calculated from the EU control flow and predication state)
* given by [group, group + exec_size) will be used to mask GRF writes and
* any other side effects of the instruction.
*/
uint8_t group;
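/* E.g. (illustrative): exec_size == 8 with group == 8 in a SIMD16 program
* applies the instruction to channels [8, 16) only.
*/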
uint32_t offset; /**< spill/unspill offset or texture offset bitfield */
uint8_t mlen; /**< SEND message length */
uint8_t ex_mlen; /**< SENDS extended message length */
int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
uint8_t target; /**< MRT target. */
uint8_t sfid; /**< SFID for SEND instructions */
uint32_t desc; /**< SEND[S] message descriptor immediate */
uint32_t ex_desc; /**< SEND[S] extended message descriptor immediate */
unsigned size_written; /**< Data written to the destination register in bytes. */
enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
enum brw_predicate predicate;
bool predicate_inverse:1;
bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
bool force_writemask_all:1;
bool no_dd_clear:1;
bool no_dd_check:1;
bool saturate:1;
bool shadow_compare:1;
bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use
* the scratch surface offset to build
* extended descriptor
*/
bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended bindless
* surface offset (26bits instead of 20bits)
*/
bool predicate_trivial:1; /**< The predication mask applied to this
* instruction is guaranteed to be uniform and
* a superset of the execution mask of the
* present block, no currently enabled channels
* will be disabled by the predicate.
*/
bool eot:1;
/* Chooses which flag subregister (f0.0 to f3.1) is used for conditional
* mod and predication.
*/
unsigned flag_subreg:3;
/**
* Systolic depth used by DPAS instruction.
*/
unsigned sdepth:4;
/**
* Repeat count used by DPAS instruction.
*/
unsigned rcount:4;
/** The number of hardware registers used for a message header. */
uint8_t header_size;
};
#endif


@ -0,0 +1,92 @@
/* -*- c++ -*- */
/*
* Copyright © 2010-2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_ALLOCATOR_H
#define BRW_IR_ALLOCATOR_H
#include "util/compiler.h"
#include "util/glheader.h"
#include "util/macros.h"
#include "util/rounding.h"
#include "util/u_math.h"
namespace brw {
/**
* Simple allocator used to keep track of virtual GRFs.
*/
class simple_allocator {
public:
simple_allocator() :
sizes(NULL), offsets(NULL), count(0), total_size(0), capacity(0)
{
}
~simple_allocator()
{
free(offsets);
free(sizes);
}
unsigned
allocate(unsigned size)
{
assert(size > 0);
if (capacity <= count) {
capacity = MAX2(16, capacity * 2);
sizes = (unsigned *)realloc(sizes, capacity * sizeof(unsigned));
offsets = (unsigned *)realloc(offsets, capacity * sizeof(unsigned));
}
sizes[count] = size;
offsets[count] = total_size;
total_size += size;
return count++;
}
/**
* Array of sizes for each allocation. The allocation unit is up to the
* back-end, but it's expected to be one scalar value in the FS back-end
* and one vec4 in the VEC4 back-end.
*/
unsigned *sizes;
/**
* Array of offsets from the start of the VGRF space in allocation
* units.
*/
unsigned *offsets;
/** Total number of VGRFs allocated. */
unsigned count;
/** Cumulative size in allocation units. */
unsigned total_size;
private:
unsigned capacity;
};
}
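/* Usage sketch (illustrative):
*
*    brw::simple_allocator alloc;
*    unsigned a = alloc.allocate(2);   // a == 0, offsets[a] == 0
*    unsigned b = alloc.allocate(1);   // b == 1, offsets[b] == 2
*    unsigned c = alloc.allocate(4);   // c == 2, offsets[c] == 3
*
* leaving count == 3 and total_size == 7 allocation units.
*/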
#endif


@ -0,0 +1,192 @@
/* -*- c++ -*- */
/*
* Copyright © 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_ANALYSIS_H
#define BRW_IR_ANALYSIS_H
namespace brw {
/**
* Bitset of state categories that can influence the result of IR analysis
* passes.
*/
enum analysis_dependency_class {
/**
* The analysis doesn't depend on the IR, its result is effectively a
* constant during the compilation.
*/
DEPENDENCY_NOTHING = 0,
/**
* The analysis depends on the set of instructions in the program and
* their naming. Note that because instructions are named sequentially
* by IP this implies a dependency on the control flow edges between
* instructions. This will be signaled whenever instructions are
* inserted, removed or reordered in the program.
*/
DEPENDENCY_INSTRUCTION_IDENTITY = 0x1,
/**
* The analysis is sensitive to the detailed semantics of instructions
* in the program, where "detailed" means any change in the instruction
* data structures other than the linked-list pointers (which are
* already covered by DEPENDENCY_INSTRUCTION_IDENTITY). E.g. changing
* the negate or abs flags of an instruction source would signal this
* flag alone because it would preserve all other instruction dependency
* classes.
*/
DEPENDENCY_INSTRUCTION_DETAIL = 0x2,
/**
* The analysis depends on the set of data flow edges between
* instructions. This will be signaled whenever the dataflow relation
* between instructions has potentially changed, e.g. when the VGRF
* index of an instruction source or destination changes (in which case
* it will appear in combination with DEPENDENCY_INSTRUCTION_DETAIL), or
* when data-dependent instructions are reordered (in which case it will
* appear in combination with DEPENDENCY_INSTRUCTION_IDENTITY).
*/
DEPENDENCY_INSTRUCTION_DATA_FLOW = 0x4,
/**
* The analysis depends on all instruction dependency classes. These
* will typically be signaled simultaneously when inserting or removing
* instructions in the program (or if you're feeling too lazy to read
* through your optimization pass to figure out which of the instruction
* dependency classes above it invalidates).
*/
DEPENDENCY_INSTRUCTIONS = 0x7,
/**
* The analysis depends on the set of VGRFs in the program and their
* naming. This will be signaled when VGRFs are allocated or released.
*/
DEPENDENCY_VARIABLES = 0x8,
/**
* The analysis depends on the set of basic blocks in the program, their
* control flow edges and naming.
*/
DEPENDENCY_BLOCKS = 0x10,
/**
* The analysis depends on the program being literally the same (good
* luck...), any change in the input invalidates previous analysis
* computations.
*/
DEPENDENCY_EVERYTHING = ~0
};
inline analysis_dependency_class
operator|(analysis_dependency_class x, analysis_dependency_class y)
{
return static_cast<analysis_dependency_class>(
static_cast<unsigned>(x) | static_cast<unsigned>(y));
}
}
/**
* Instantiate a program analysis class \p L which can calculate an object of
* type \p T as result. \p C is a closure that encapsulates whatever
* information is required as argument to run the analysis pass. The purpose
* of this class is to make sure that:
*
* - The analysis pass is executed lazily whenever it's needed and multiple
* executions are optimized out as long as the cached result remains marked
* up-to-date.
*
* - There is no way to access the cached analysis result without first
* calling L::require(), which makes sure that the analysis pass is rerun
* if necessary.
*
* - The cached result doesn't become inconsistent with the program for as
* long as it remains marked up-to-date. (This is only enforced in debug
* builds for performance reasons)
*
* The requirements on \p T are the following:
*
* - Constructible with a single argument, as in 'x = T(c)' for \p c of type
* \p C.
*
* - 'x.dependency_class()' on const \p x returns a bitset of
* brw::analysis_dependency_class specifying the set of IR objects that are
* required to remain invariant for the cached analysis result to be
* considered valid.
*
* - 'x.validate(c)' on const \p x returns a boolean result specifying
* whether the analysis result \p x is consistent with the input IR. This
* is currently only used for validation in debug builds.
*/
template<class T, class C>
class brw_analysis {
public:
/**
* Construct a program analysis. \p c is an arbitrary object
* passed as argument to the constructor of the analysis result
* object of type \p T.
*/
brw_analysis(const C *c) : c(c), p(NULL) {}
/**
* Destroy a program analysis.
*/
~brw_analysis()
{
delete p;
}
/**
* Obtain the result of a program analysis. This gives a
* guaranteed up-to-date result, the analysis pass will be
* rerun implicitly if it has become stale.
*/
T &
require()
{
if (p)
assert(p->validate(c));
else
p = new T(c);
return *p;
}
const T &
require() const
{
return const_cast<brw_analysis<T, C> *>(this)->require();
}
/**
* Report that dependencies of the analysis pass may have changed
* since the last calculation and the cached analysis result may
* have to be discarded.
*/
void
invalidate(brw::analysis_dependency_class c)
{
if (p && (c & p->dependency_class())) {
delete p;
p = NULL;
}
}
private:
const C *c;
T *p;
};
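/* Usage sketch (hypothetical names, shown only for illustration): the
* minimal shape of an analysis result type T usable with brw_analysis<T, C>.
*/
struct example_ir;   /* hypothetical IR container standing in for C */
struct example_count_analysis {
example_count_analysis(const example_ir *) {}
brw::analysis_dependency_class
dependency_class() const
{
return brw::DEPENDENCY_INSTRUCTIONS | brw::DEPENDENCY_BLOCKS;
}
bool validate(const example_ir *) const { return true; }
};
/* A pass holding 'brw_analysis<example_count_analysis, example_ir> counts'
* would call counts.require() to get an up-to-date result and
* counts.invalidate(brw::DEPENDENCY_INSTRUCTIONS) after rewriting the IR.
*/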
#endif


@ -0,0 +1,737 @@
/* -*- c++ -*- */
/*
* Copyright © 2010-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_FS_H
#define BRW_IR_FS_H
#include "brw_shader.h"
class fs_inst;
class fs_reg : public backend_reg {
public:
DECLARE_RALLOC_CXX_OPERATORS(fs_reg)
void init();
fs_reg();
fs_reg(struct ::brw_reg reg);
fs_reg(enum brw_reg_file file, unsigned nr);
fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type);
bool equals(const fs_reg &r) const;
bool negative_equals(const fs_reg &r) const;
bool is_contiguous() const;
/**
* Return the size in bytes of a single logical component of the
* register assuming the given execution width.
*/
unsigned component_size(unsigned width) const;
/** Register region horizontal stride */
uint8_t stride;
};
static inline fs_reg
negate(fs_reg reg)
{
assert(reg.file != IMM);
reg.negate = !reg.negate;
return reg;
}
static inline fs_reg
retype(fs_reg reg, enum brw_reg_type type)
{
reg.type = type;
return reg;
}
static inline fs_reg
byte_offset(fs_reg reg, unsigned delta)
{
switch (reg.file) {
case BAD_FILE:
break;
case VGRF:
case ATTR:
case UNIFORM:
reg.offset += delta;
break;
case MRF: {
const unsigned suboffset = reg.offset + delta;
reg.nr += suboffset / REG_SIZE;
reg.offset = suboffset % REG_SIZE;
break;
}
case ARF:
case FIXED_GRF: {
const unsigned suboffset = reg.subnr + delta;
reg.nr += suboffset / REG_SIZE;
reg.subnr = suboffset % REG_SIZE;
break;
}
case IMM:
default:
assert(delta == 0);
}
return reg;
}
static inline fs_reg
horiz_offset(const fs_reg &reg, unsigned delta)
{
switch (reg.file) {
case BAD_FILE:
case UNIFORM:
case IMM:
/* These only have a single component that is implicitly splatted. A
* horizontal offset should be a harmless no-op.
* XXX - Handle vector immediates correctly.
*/
return reg;
case VGRF:
case MRF:
case ATTR:
return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
case ARF:
case FIXED_GRF:
if (reg.is_null()) {
return reg;
} else {
const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
const unsigned width = 1 << reg.width;
if (delta % width == 0) {
return byte_offset(reg, delta / width * vstride * type_sz(reg.type));
} else {
assert(vstride == hstride * width);
return byte_offset(reg, delta * hstride * type_sz(reg.type));
}
}
}
unreachable("Invalid register file");
}
static inline fs_reg
offset(fs_reg reg, unsigned width, unsigned delta)
{
switch (reg.file) {
case BAD_FILE:
break;
case ARF:
case FIXED_GRF:
case MRF:
case VGRF:
case ATTR:
case UNIFORM:
return byte_offset(reg, delta * reg.component_size(width));
case IMM:
assert(delta == 0);
}
return reg;
}
/**
* Get the scalar channel of \p reg given by \p idx and replicate it to all
* channels of the result.
*/
static inline fs_reg
component(fs_reg reg, unsigned idx)
{
reg = horiz_offset(reg, idx);
reg.stride = 0;
if (reg.file == ARF || reg.file == FIXED_GRF) {
reg.vstride = BRW_VERTICAL_STRIDE_0;
reg.width = BRW_WIDTH_1;
reg.hstride = BRW_HORIZONTAL_STRIDE_0;
}
return reg;
}
/**
* Return an integer identifying the discrete address space a register is
* contained in. A register is by definition fully contained in the single
* reg_space it belongs to, so two registers with different reg_space ids are
* guaranteed not to overlap. Most register files are a single reg_space of
* its own, only the VGRF and ATTR files are composed of multiple discrete
* address spaces, one for each allocation and input attribute respectively.
*/
static inline uint32_t
reg_space(const fs_reg &r)
{
return r.file << 16 | (r.file == VGRF || r.file == ATTR ? r.nr : 0);
}
/**
* Return the base offset in bytes of a register relative to the start of its
* reg_space().
*/
static inline unsigned
reg_offset(const fs_reg &r)
{
return (r.file == VGRF || r.file == IMM || r.file == ATTR ? 0 : r.nr) *
(r.file == UNIFORM ? 4 : REG_SIZE) + r.offset +
(r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
}
/**
* Return the amount of padding in bytes left unused between individual
* components of register \p r due to a (horizontal) stride value greater than
* one, or zero if components are tightly packed in the register file.
*/
static inline unsigned
reg_padding(const fs_reg &r)
{
const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride :
r.hstride == 0 ? 0 :
1 << (r.hstride - 1));
return (MAX2(1, stride) - 1) * type_sz(r.type);
}
/* Do not call this directly. Call regions_overlap() instead. */
static inline bool
regions_overlap_MRF(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
if (r.nr & BRW_MRF_COMPR4) {
fs_reg t = r;
t.nr &= ~BRW_MRF_COMPR4;
/* COMPR4 regions are translated by the hardware during decompression
* into two separate half-regions 4 MRFs apart from each other.
*
* Note: swapping s and t in this parameter list eliminates one possible
* level of recursion (since the s in the called versions of
* regions_overlap_MRF can't be COMPR4), and that makes the compiled
* code a lot smaller.
*/
return regions_overlap_MRF(s, ds, t, dr / 2) ||
regions_overlap_MRF(s, ds, byte_offset(t, 4 * REG_SIZE), dr / 2);
} else if (s.nr & BRW_MRF_COMPR4) {
return regions_overlap_MRF(s, ds, r, dr);
}
return !((r.nr * REG_SIZE + r.offset + dr) <= (s.nr * REG_SIZE + s.offset) ||
(s.nr * REG_SIZE + s.offset + ds) <= (r.nr * REG_SIZE + r.offset));
}
/**
* Return whether the register region starting at \p r and spanning \p dr
* bytes could potentially overlap the register region starting at \p s and
* spanning \p ds bytes.
*/
static inline bool
regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
if (r.file != s.file)
return false;
if (r.file == VGRF) {
return r.nr == s.nr &&
!(r.offset + dr <= s.offset || s.offset + ds <= r.offset);
} else if (r.file != MRF) {
return !(reg_offset(r) + dr <= reg_offset(s) ||
reg_offset(s) + ds <= reg_offset(r));
} else {
return regions_overlap_MRF(r, dr, s, ds);
}
}
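/* E.g. (illustrative): within the same VGRF, byte ranges [0, 32) and
* [16, 48) overlap, while [0, 32) and [32, 64) do not.
*/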
/**
* Check that the register region given by r [r.offset, r.offset + dr[
* is fully contained inside the register region given by s
* [s.offset, s.offset + ds[.
*/
static inline bool
region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
{
return reg_space(r) == reg_space(s) &&
reg_offset(r) >= reg_offset(s) &&
reg_offset(r) + dr <= reg_offset(s) + ds;
}
/**
* Return whether the given register region is n-periodic, i.e. whether the
* original region remains invariant after shifting it by \p n scalar
* channels.
*/
static inline bool
is_periodic(const fs_reg &reg, unsigned n)
{
if (reg.file == BAD_FILE || reg.is_null()) {
return true;
} else if (reg.file == IMM) {
const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV ||
reg.type == BRW_REGISTER_TYPE_V ? 8 :
reg.type == BRW_REGISTER_TYPE_VF ? 4 :
1);
return n % period == 0;
} else if (reg.file == ARF || reg.file == FIXED_GRF) {
const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 :
reg.vstride == 0 ? 1 << reg.width :
~0);
return n % period == 0;
} else {
return reg.stride == 0;
}
}
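/* E.g. (illustrative): a stride-0 VGRF region is periodic for every n,
* while a packed vector-float immediate (4 floats) is only 4-periodic.
*/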
static inline bool
is_uniform(const fs_reg &reg)
{
return is_periodic(reg, 1);
}
/**
* Get the specified 8-component quarter of a register.
*/
static inline fs_reg
quarter(const fs_reg &reg, unsigned idx)
{
assert(idx < 4);
return horiz_offset(reg, 8 * idx);
}
/**
* Reinterpret each channel of register \p reg as a vector of values of the
* given smaller type and take the i-th subcomponent from each.
*/
static inline fs_reg
subscript(fs_reg reg, brw_reg_type type, unsigned i)
{
assert((i + 1) * type_sz(type) <= type_sz(reg.type));
if (reg.file == ARF || reg.file == FIXED_GRF) {
/* The stride is encoded inconsistently for fixed GRF and ARF registers
* as the log2 of the actual vertical and horizontal strides.
*/
const int delta = util_logbase2(type_sz(reg.type)) -
util_logbase2(type_sz(type));
reg.hstride += (reg.hstride ? delta : 0);
reg.vstride += (reg.vstride ? delta : 0);
} else if (reg.file == IMM) {
unsigned bit_size = type_sz(type) * 8;
reg.u64 >>= i * bit_size;
reg.u64 &= BITFIELD64_MASK(bit_size);
if (bit_size <= 16)
reg.u64 |= reg.u64 << 16;
return retype(reg, type);
} else {
reg.stride *= type_sz(reg.type) / type_sz(type);
}
return byte_offset(retype(reg, type), i * type_sz(type));
}
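/* E.g. (illustrative): for a D-typed VGRF region, subscript(reg,
* BRW_REGISTER_TYPE_W, 1) reads the high 16-bit word of every 32-bit
* channel by doubling the stride and adding a 2-byte offset.
*/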
static inline fs_reg
horiz_stride(fs_reg reg, unsigned s)
{
reg.stride *= s;
return reg;
}
static const fs_reg reg_undef;
class fs_inst : public backend_instruction {
fs_inst &operator=(const fs_inst &);
void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
const fs_reg *src, unsigned sources);
public:
DECLARE_RALLOC_CXX_OPERATORS(fs_inst)
fs_inst();
fs_inst(enum opcode opcode, uint8_t exec_size);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg &src0);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg &src0, const fs_reg &src1);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
const fs_reg src[], unsigned sources);
fs_inst(const fs_inst &that);
~fs_inst();
void resize_sources(uint8_t num_sources);
bool is_send_from_grf() const;
bool is_payload(unsigned arg) const;
bool is_partial_write() const;
unsigned components_read(unsigned i) const;
unsigned size_read(int arg) const;
bool can_do_source_mods(const struct intel_device_info *devinfo) const;
bool can_do_cmod();
bool can_change_types() const;
bool has_source_and_destination_hazard() const;
unsigned implied_mrf_writes() const;
/**
* Return whether \p arg is a control source of a virtual instruction which
* shouldn't contribute to the execution type and usual regioning
* restriction calculations of arithmetic instructions.
*/
bool is_control_source(unsigned arg) const;
/**
* Return the subset of flag registers read by the instruction as a bitset
* with byte granularity.
*/
unsigned flags_read(const intel_device_info *devinfo) const;
/**
* Return the subset of flag registers updated by the instruction (either
* partially or fully) as a bitset with byte granularity.
*/
unsigned flags_written(const intel_device_info *devinfo) const;
/**
* Return true if this instruction is a sampler message gathering residency
* data.
*/
bool has_sampler_residency() const;
fs_reg dst;
fs_reg *src;
uint8_t sources; /**< Number of fs_reg sources. */
bool last_rt:1;
bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */
bool keep_payload_trailing_zeros;
tgl_swsb sched; /**< Scheduling info. */
};
/**
* Make the execution of \p inst dependent on the evaluation of a possibly
* inverted predicate.
*/
static inline fs_inst *
set_predicate_inv(enum brw_predicate pred, bool inverse,
fs_inst *inst)
{
inst->predicate = pred;
inst->predicate_inverse = inverse;
return inst;
}
/**
* Make the execution of \p inst dependent on the evaluation of a predicate.
*/
static inline fs_inst *
set_predicate(enum brw_predicate pred, fs_inst *inst)
{
return set_predicate_inv(pred, false, inst);
}
/**
* Write the result of evaluating the condition given by \p mod to a flag
* register.
*/
static inline fs_inst *
set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
{
inst->conditional_mod = mod;
return inst;
}
/**
* Clamp the result of \p inst to the saturation range of its destination
* datatype.
*/
static inline fs_inst *
set_saturate(bool saturate, fs_inst *inst)
{
inst->saturate = saturate;
return inst;
}
/**
* Return the number of dataflow registers written by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
* register_size)'. The somewhat arbitrary register size unit is 4B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_written(const fs_inst *inst)
{
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
inst->size_written -
MIN2(inst->size_written, reg_padding(inst->dst)),
REG_SIZE);
}
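/* Worked example (illustrative): a VGRF destination at byte offset 8 with
* size_written == 64 and unit stride touches bytes [8, 72) of its VGRF,
* i.e. DIV_ROUND_UP(8 + 64, 32) == 3 registers, one more than the 2 that
* 64 bytes alone would suggest.
*/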
/**
* Return the number of dataflow registers read by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
* register_size)'. The somewhat arbitrary register size unit is 4B for the
* UNIFORM files and 32B for all other files.
*/
inline unsigned
regs_read(const fs_inst *inst, unsigned i)
{
if (inst->src[i].file == IMM)
return 1;
const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
inst->size_read(i) -
MIN2(inst->size_read(i), reg_padding(inst->src[i])),
reg_size);
}
static inline enum brw_reg_type
get_exec_type(const fs_inst *inst)
{
brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file != BAD_FILE &&
!inst->is_control_source(i)) {
const brw_reg_type t = get_exec_type(inst->src[i].type);
if (type_sz(t) > type_sz(exec_type))
exec_type = t;
else if (type_sz(t) == type_sz(exec_type) &&
brw_reg_type_is_floating_point(t))
exec_type = t;
}
}
if (exec_type == BRW_REGISTER_TYPE_B)
exec_type = inst->dst.type;
assert(exec_type != BRW_REGISTER_TYPE_B);
/* Promotion of the execution type to 32-bit for conversions from or to
* half-float seems to be consistent with the following text from the
* Cherryview PRM Vol. 7, "Execution Data Type":
*
* "When single precision and half precision floats are mixed between
* source operands or between source and destination operand [..] single
* precision float is the execution datatype."
*
* and from "Register Region Restrictions":
*
* "Conversion between Integer and HF (Half Float) must be DWord aligned
* and strided by a DWord on the destination."
*/
if (type_sz(exec_type) == 2 &&
inst->dst.type != exec_type) {
if (exec_type == BRW_REGISTER_TYPE_HF)
exec_type = BRW_REGISTER_TYPE_F;
else if (inst->dst.type == BRW_REGISTER_TYPE_HF)
exec_type = BRW_REGISTER_TYPE_D;
}
return exec_type;
}
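/* E.g. (illustrative): a MOV from a W source to an HF destination executes
* as D per the rules above, while mixing HF sources with an F destination
* executes as F.
*/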
static inline unsigned
get_exec_type_size(const fs_inst *inst)
{
return type_sz(get_exec_type(inst));
}
static inline bool
is_send(const fs_inst *inst)
{
return inst->mlen || inst->is_send_from_grf();
}
/**
* Return whether the instruction isn't an ALU instruction and cannot be
* assumed to complete in-order.
*/
static inline bool
is_unordered(const intel_device_info *devinfo, const fs_inst *inst)
{
return is_send(inst) || (devinfo->ver < 20 && inst->is_math()) ||
inst->opcode == BRW_OPCODE_DPAS ||
(devinfo->has_64bit_float_via_math_pipe &&
(get_exec_type(inst) == BRW_REGISTER_TYPE_DF ||
inst->dst.type == BRW_REGISTER_TYPE_DF));
}
/**
* Return whether the following regioning restriction applies to the specified
* instruction. From the Cherryview PRM Vol 7. "Register Region
* Restrictions":
*
* "When source or destination datatype is 64b or operation is integer DWord
* multiply, regioning in Align1 must follow these rules:
*
* 1. Source and Destination horizontal stride must be aligned to the same qword.
* 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
* 3. Source and Destination offset must be the same, except the case of
* scalar source."
*/
static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
const fs_inst *inst,
brw_reg_type dst_type)
{
const brw_reg_type exec_type = get_exec_type(inst);
/* Even though the hardware spec claims that "integer DWord multiply"
* operations are restricted, empirical evidence and the behavior of the
* simulator suggest that only 32x32-bit integer multiplication is
* restricted.
*/
const bool is_dword_multiply = !brw_reg_type_is_floating_point(exec_type) &&
((inst->opcode == BRW_OPCODE_MUL &&
MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
(inst->opcode == BRW_OPCODE_MAD &&
MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));
if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 ||
(type_sz(exec_type) == 4 && is_dword_multiply))
return devinfo->platform == INTEL_PLATFORM_CHV ||
intel_device_info_is_9lp(devinfo) ||
devinfo->verx10 >= 125;
else if (brw_reg_type_is_floating_point(dst_type))
return devinfo->verx10 >= 125;
else
return false;
}
static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
const fs_inst *inst)
{
return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
}
/**
* Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
* the specified register file into a VGRF.
*
* This implies identity register regions without any source-destination
* overlap, but otherwise has no implications on the location of sources and
* destination in the register file: Gathering any number of portions from
* multiple virtual registers in any order is allowed.
*/
inline bool
is_copy_payload(brw_reg_file file, const fs_inst *inst)
{
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD ||
inst->is_partial_write() || inst->saturate ||
inst->dst.file != VGRF)
return false;
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file != file ||
inst->src[i].abs || inst->src[i].negate)
return false;
if (!inst->src[i].is_contiguous())
return false;
if (regions_overlap(inst->dst, inst->size_written,
inst->src[i], inst->size_read(i)))
return false;
}
return true;
}
/**
* Like is_copy_payload(), but the instruction is required to copy a single
* contiguous block of registers from the given register file into the
* destination without any reordering.
*/
inline bool
is_identity_payload(brw_reg_file file, const fs_inst *inst) {
if (is_copy_payload(file, inst)) {
fs_reg reg = inst->src[0];
for (unsigned i = 0; i < inst->sources; i++) {
reg.type = inst->src[i].type;
if (!inst->src[i].equals(reg))
return false;
reg = byte_offset(reg, inst->size_read(i));
}
return true;
} else {
return false;
}
}
/**
* Like is_copy_payload(), but the instruction is required to source data from
* at least two disjoint VGRFs.
*
* This doesn't necessarily rule out the elimination of this instruction
* through register coalescing, but due to limitations of the register
* coalesce pass it might be impossible to do so directly until a later stage,
* when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
* instructions.
*/
inline bool
is_multi_copy_payload(const fs_inst *inst) {
if (is_copy_payload(VGRF, inst)) {
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].nr != inst->src[0].nr)
return true;
}
}
return false;
}
/**
* Like is_identity_payload(), but the instruction is required to copy the
* whole contents of a single VGRF into the destination.
*
* This means that there is a good chance that the instruction will be
* eliminated through register coalescing, but it's neither a necessary nor a
* sufficient condition for that to happen -- E.g. consider the case where
* source and destination registers diverge due to other instructions in the
* program overwriting part of their contents, which isn't something we can
* predict up front based on a cheap strictly local test of the copy
* instruction.
*/
inline bool
is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst)
{
return is_identity_payload(VGRF, inst) &&
inst->src[0].offset == 0 &&
alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
}
bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst);
#endif

File diff suppressed because it is too large


@ -0,0 +1,86 @@
/* -*- c++ -*- */
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_PERFORMANCE_H
#define BRW_IR_PERFORMANCE_H
class fs_visitor;
namespace brw {
class vec4_visitor;
/**
* Various estimates of the performance of a shader based on static
* analysis.
*/
struct performance {
performance(const fs_visitor *v);
performance(const vec4_visitor *v);
~performance();
analysis_dependency_class
dependency_class() const
{
return (DEPENDENCY_INSTRUCTIONS |
DEPENDENCY_BLOCKS);
}
bool
validate(const backend_shader *) const
{
return true;
}
/**
* Array containing estimates of the runtime of each basic block of the
* program in cycle units.
*/
unsigned *block_latency;
/**
* Estimate of the runtime of the whole program in cycle units assuming
* uncontended execution.
*/
unsigned latency;
/**
* Estimate of the throughput of the whole program in
* invocations-per-cycle units.
*
* Note that this might be lower than the ratio between the dispatch
* width of the program and its latency estimate in cases where
* performance doesn't scale without limits as a function of its thread
* parallelism, e.g. due to the existence of a bottleneck in a shared
* function.
*/
float throughput;
private:
performance(const performance &perf);
performance &
operator=(performance u);
};
}
#endif


@ -0,0 +1,475 @@
/* -*- c++ -*- */
/*
* Copyright © 2011-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_VEC4_H
#define BRW_IR_VEC4_H
#include "brw_shader.h"
namespace brw {
class dst_reg;
class src_reg : public backend_reg
{
public:
DECLARE_RALLOC_CXX_OPERATORS(src_reg)
void init();
src_reg(enum brw_reg_file file, int nr, const glsl_type *type);
src_reg();
src_reg(struct ::brw_reg reg);
bool equals(const src_reg &r) const;
bool negative_equals(const src_reg &r) const;
src_reg(class vec4_visitor *v, const struct glsl_type *type);
src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
explicit src_reg(const dst_reg &reg);
src_reg *reladdr;
};
static inline src_reg
retype(src_reg reg, enum brw_reg_type type)
{
reg.type = type;
return reg;
}
namespace detail {
static inline void
add_byte_offset(backend_reg *reg, unsigned bytes)
{
switch (reg->file) {
case BAD_FILE:
break;
case VGRF:
case ATTR:
case UNIFORM:
reg->offset += bytes;
assert(reg->offset % 16 == 0);
break;
case MRF: {
const unsigned suboffset = reg->offset + bytes;
reg->nr += suboffset / REG_SIZE;
reg->offset = suboffset % REG_SIZE;
assert(reg->offset % 16 == 0);
break;
}
case ARF:
case FIXED_GRF: {
const unsigned suboffset = reg->subnr + bytes;
reg->nr += suboffset / REG_SIZE;
reg->subnr = suboffset % REG_SIZE;
assert(reg->subnr % 16 == 0);
break;
}
default:
assert(bytes == 0);
}
}
} /* namespace detail */
static inline src_reg
byte_offset(src_reg reg, unsigned bytes)
{
detail::add_byte_offset(&reg, bytes);
return reg;
}
static inline src_reg
offset(src_reg reg, unsigned width, unsigned delta)
{
const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
const unsigned num_components = MAX2(width / 4 * stride, 4);
return byte_offset(reg, num_components * type_sz(reg.type) * delta);
}
static inline src_reg
horiz_offset(src_reg reg, unsigned delta)
{
return byte_offset(reg, delta * type_sz(reg.type));
}
/**
* Reswizzle a given source register.
* \sa brw_swizzle().
*/
static inline src_reg
swizzle(src_reg reg, unsigned swizzle)
{
if (reg.file == IMM)
reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle);
else
reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle);
return reg;
}
static inline src_reg
negate(src_reg reg)
{
assert(reg.file != IMM);
reg.negate = !reg.negate;
return reg;
}
static inline bool
is_uniform(const src_reg &reg)
{
return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) &&
(!reg.reladdr || is_uniform(*reg.reladdr));
}
class dst_reg : public backend_reg
{
public:
DECLARE_RALLOC_CXX_OPERATORS(dst_reg)
void init();
dst_reg();
dst_reg(enum brw_reg_file file, int nr);
dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
unsigned writemask);
dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
unsigned writemask);
dst_reg(struct ::brw_reg reg);
dst_reg(class vec4_visitor *v, const struct glsl_type *type);
explicit dst_reg(const src_reg &reg);
bool equals(const dst_reg &r) const;
src_reg *reladdr;
};
static inline dst_reg
retype(dst_reg reg, enum brw_reg_type type)
{
reg.type = type;
return reg;
}
static inline dst_reg
byte_offset(dst_reg reg, unsigned bytes)
{
detail::add_byte_offset(&reg, bytes);
return reg;
}
static inline dst_reg
offset(dst_reg reg, unsigned width, unsigned delta)
{
const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
const unsigned num_components = MAX2(width / 4 * stride, 4);
return byte_offset(reg, num_components * type_sz(reg.type) * delta);
}
static inline dst_reg
horiz_offset(const dst_reg &reg, unsigned delta)
{
if (is_uniform(src_reg(reg)))
return reg;
else
return byte_offset(reg, delta * type_sz(reg.type));
}
static inline dst_reg
writemask(dst_reg reg, unsigned mask)
{
assert(reg.file != IMM);
assert((reg.writemask & mask) != 0);
reg.writemask &= mask;
return reg;
}
/**
* Return an integer identifying the discrete address space a register is
* contained in. A register is by definition fully contained in the single
* reg_space it belongs to, so two registers with different reg_space ids are
* guaranteed not to overlap. Most register files form a single reg_space of
* their own; only the VGRF file is composed of multiple discrete address
* spaces, one for each VGRF allocation.
*/
static inline uint32_t
reg_space(const backend_reg &r)
{
return r.file << 16 | (r.file == VGRF ? r.nr : 0);
}
/**
* Return the base offset in bytes of a register relative to the start of its
* reg_space().
*/
static inline unsigned
reg_offset(const backend_reg &r)
{
return (r.file == VGRF || r.file == IMM ? 0 : r.nr) *
(r.file == UNIFORM ? 16 : REG_SIZE) + r.offset +
(r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
}
/**
* Return whether the register region starting at \p r and spanning \p dr
* bytes could potentially overlap the register region starting at \p s and
* spanning \p ds bytes.
*/
static inline bool
regions_overlap(const backend_reg &r, unsigned dr,
const backend_reg &s, unsigned ds)
{
if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
/* COMPR4 regions are translated by the hardware during decompression
* into two separate half-regions 4 MRFs apart from each other.
*/
backend_reg t0 = r;
t0.nr &= ~BRW_MRF_COMPR4;
backend_reg t1 = t0;
t1.offset += 4 * REG_SIZE;
return regions_overlap(t0, dr / 2, s, ds) ||
regions_overlap(t1, dr / 2, s, ds);
} else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
return regions_overlap(s, ds, r, dr);
} else {
return reg_space(r) == reg_space(s) &&
!(reg_offset(r) + dr <= reg_offset(s) ||
reg_offset(s) + ds <= reg_offset(r));
}
}
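An aside on the two helpers above: reg_space() assigns every register a discrete address-space id and reg_offset() a byte position within it, so overlap checking reduces to a one-dimensional interval test per space. A minimal self-contained sketch of that technique, with a simplified register struct standing in for backend_reg (the field names and REG_SIZE value here are illustrative assumptions, not the in-tree definitions):
#include <assert.h>
#include <stdint.h>
#define REG_SIZE 32  /* illustrative: one hardware register, in bytes */
enum file { VGRF, FIXED_GRF };
struct reg { enum file file; unsigned nr; unsigned offset; };
/* Discrete address space id: VGRFs get one space per allocation. */
static uint32_t space(const struct reg *r)
{
   return (uint32_t)r->file << 16 | (r->file == VGRF ? r->nr : 0);
}
/* Byte offset from the start of the register's address space. */
static unsigned off(const struct reg *r)
{
   return (r->file == VGRF ? 0 : r->nr) * REG_SIZE + r->offset;
}
/* Two regions overlap iff they share a space and their intervals meet. */
static int overlap(const struct reg *r, unsigned dr,
                   const struct reg *s, unsigned ds)
{
   return space(r) == space(s) &&
          !(off(r) + dr <= off(s) || off(s) + ds <= off(r));
}
int main(void)
{
   struct reg a = { VGRF, 1, 0 }, b = { VGRF, 1, 16 }, c = { VGRF, 2, 0 };
   assert(overlap(&a, 32, &b, 16));   /* same VGRF, intervals intersect */
   assert(!overlap(&a, 16, &b, 16));  /* same VGRF, disjoint intervals */
   assert(!overlap(&a, 32, &c, 32));  /* different VGRF allocations */
   return 0;
}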
class vec4_instruction : public backend_instruction {
public:
DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction)
vec4_instruction(enum opcode opcode,
const dst_reg &dst = dst_reg(),
const src_reg &src0 = src_reg(),
const src_reg &src1 = src_reg(),
const src_reg &src2 = src_reg());
dst_reg dst;
src_reg src[3];
enum brw_urb_write_flags urb_write_flags;
unsigned sol_binding; /**< gfx6: SOL binding table index */
bool sol_final_write; /**< gfx6: send commit message */
unsigned sol_vertex; /**< gfx6: used for setting dst index in SVB header */
bool is_send_from_grf() const;
unsigned size_read(unsigned arg) const;
bool can_reswizzle(const struct intel_device_info *devinfo,
int dst_writemask,
int swizzle, int swizzle_mask);
void reswizzle(int dst_writemask, int swizzle);
bool can_do_source_mods(const struct intel_device_info *devinfo);
bool can_do_cmod();
bool can_do_writemask(const struct intel_device_info *devinfo);
bool can_change_types() const;
bool has_source_and_destination_hazard() const;
unsigned implied_mrf_writes() const;
bool is_align1_partial_write()
{
return opcode == VEC4_OPCODE_SET_LOW_32BIT ||
opcode == VEC4_OPCODE_SET_HIGH_32BIT;
}
bool reads_flag() const
{
return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2;
}
bool reads_flag(unsigned c)
{
if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
return true;
switch (predicate) {
case BRW_PREDICATE_NONE:
return false;
case BRW_PREDICATE_ALIGN16_REPLICATE_X:
return c == 0;
case BRW_PREDICATE_ALIGN16_REPLICATE_Y:
return c == 1;
case BRW_PREDICATE_ALIGN16_REPLICATE_Z:
return c == 2;
case BRW_PREDICATE_ALIGN16_REPLICATE_W:
return c == 3;
default:
return true;
}
}
bool writes_flag(const intel_device_info *devinfo) const
{
return (conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
opcode != BRW_OPCODE_CSEL &&
opcode != BRW_OPCODE_IF &&
opcode != BRW_OPCODE_WHILE));
}
bool reads_g0_implicitly() const
{
switch (opcode) {
case SHADER_OPCODE_TEX:
case SHADER_OPCODE_TXL:
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_CMS:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TXS:
case SHADER_OPCODE_TG4:
case SHADER_OPCODE_TG4_OFFSET:
case SHADER_OPCODE_SAMPLEINFO:
case VS_OPCODE_PULL_CONSTANT_LOAD:
case GS_OPCODE_SET_PRIMITIVE_ID:
case GS_OPCODE_GET_INSTANCE_ID:
case SHADER_OPCODE_GFX4_SCRATCH_READ:
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
return true;
default:
return false;
}
}
};
/**
* Make the execution of \p inst dependent on the evaluation of a possibly
* inverted predicate.
*/
inline vec4_instruction *
set_predicate_inv(enum brw_predicate pred, bool inverse,
vec4_instruction *inst)
{
inst->predicate = pred;
inst->predicate_inverse = inverse;
return inst;
}
/**
* Make the execution of \p inst dependent on the evaluation of a predicate.
*/
inline vec4_instruction *
set_predicate(enum brw_predicate pred, vec4_instruction *inst)
{
return set_predicate_inv(pred, false, inst);
}
/**
* Write the result of evaluating the condition given by \p mod to a flag
* register.
*/
inline vec4_instruction *
set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst)
{
inst->conditional_mod = mod;
return inst;
}
/**
* Clamp the result of \p inst to the saturation range of its destination
* datatype.
*/
inline vec4_instruction *
set_saturate(bool saturate, vec4_instruction *inst)
{
inst->saturate = saturate;
return inst;
}
/**
* Return the number of dataflow registers written by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
* register_size)'. The somewhat arbitrary register size unit is 16B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_written(const vec4_instruction *inst)
{
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written,
REG_SIZE);
}
/**
* Return the number of dataflow registers read by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
* register_size)'. The somewhat arbitrary register size unit is 16B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_read(const vec4_instruction *inst, unsigned i)
{
const unsigned reg_size =
inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE;
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i),
reg_size);
}
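A quick worked check of the counting rule in regs_written()/regs_read(): a region's register footprint is the ceiling of (start misalignment + size) over the unit size, 16B for UNIFORM/IMM and 32B otherwise. Standalone, with DIV_ROUND_UP re-defined locally for illustration:
#include <assert.h>
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
int main(void)
{
   /* A 32B write starting 16 bytes into a 32B register unit straddles
    * two registers: DIV_ROUND_UP(16 % 32 + 32, 32) == 2. */
   assert(DIV_ROUND_UP(16 % 32 + 32, 32) == 2);
   /* The same 32B write aligned to a register boundary touches one. */
   assert(DIV_ROUND_UP(0 % 32 + 32, 32) == 1);
   /* A UNIFORM or IMM source uses a 16B unit instead of 32B. */
   assert(DIV_ROUND_UP(8 % 16 + 16, 16) == 2);
   return 0;
}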
static inline enum brw_reg_type
get_exec_type(const vec4_instruction *inst)
{
enum brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
for (int i = 0; i < 3; i++) {
if (inst->src[i].file != BAD_FILE) {
const brw_reg_type t = get_exec_type(brw_reg_type(inst->src[i].type));
if (type_sz(t) > type_sz(exec_type))
exec_type = t;
else if (type_sz(t) == type_sz(exec_type) &&
brw_reg_type_is_floating_point(t))
exec_type = t;
}
}
if (exec_type == BRW_REGISTER_TYPE_B)
exec_type = inst->dst.type;
/* TODO: We need to handle half-float conversions. */
assert(exec_type != BRW_REGISTER_TYPE_HF ||
inst->dst.type == BRW_REGISTER_TYPE_HF);
assert(exec_type != BRW_REGISTER_TYPE_B);
return exec_type;
}
static inline unsigned
get_exec_type_size(const vec4_instruction *inst)
{
return type_sz(get_exec_type(inst));
}
} /* namespace brw */
#endif

View file

@ -0,0 +1,86 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef BRW_ISA_ENCODING_H
#define BRW_ISA_ENCODING_H
#include "dev/intel_device_info.h"
#include "brw_eu_defines.h"
#ifdef __cplusplus
extern "C" {
#endif
struct opcode_desc;
struct brw_isa_info {
const struct intel_device_info *devinfo;
/* A mapping from enum opcode to the corresponding opcode_desc */
const struct opcode_desc *ir_to_descs[NUM_BRW_OPCODES];
/** A mapping from a HW opcode encoding to the corresponding opcode_desc */
const struct opcode_desc *hw_to_descs[128];
};
void brw_init_isa_info(struct brw_isa_info *isa,
const struct intel_device_info *devinfo);
struct opcode_desc {
unsigned ir;
unsigned hw;
const char *name;
int nsrc;
int ndst;
int gfx_vers;
};
const struct opcode_desc *
brw_opcode_desc(const struct brw_isa_info *isa, enum opcode opcode);
const struct opcode_desc *
brw_opcode_desc_from_hw(const struct brw_isa_info *isa, unsigned hw);
static inline unsigned
brw_opcode_encode(const struct brw_isa_info *isa, enum opcode opcode)
{
return brw_opcode_desc(isa, opcode)->hw;
}
static inline enum opcode
brw_opcode_decode(const struct brw_isa_info *isa, unsigned hw)
{
const struct opcode_desc *desc = brw_opcode_desc_from_hw(isa, hw);
return desc ? (enum opcode)desc->ir : BRW_OPCODE_ILLEGAL;
}
static inline bool
is_3src(const struct brw_isa_info *isa, enum opcode opcode)
{
const struct opcode_desc *desc = brw_opcode_desc(isa, opcode);
return desc && desc->nsrc == 3;
}
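The two tables in brw_isa_info make encode and decode both O(1) array lookups: ir_to_descs is indexed by IR opcode, hw_to_descs by hardware encoding, and unknown encodings decode to NULL. A hedged, standalone mirror of that double-indexing scheme (the table contents here are made up for illustration):
#include <assert.h>
#include <stddef.h>
struct desc { unsigned ir; unsigned hw; const char *name; };
/* Hypothetical descriptor table: IR opcode 0 encodes as HW 0x60, etc. */
static const struct desc descs[] = {
   { 0, 0x60, "mov" },
   { 1, 0x61, "add" },
};
static const struct desc *ir_to_desc[2];
static const struct desc *hw_to_desc[128];
static void init(void)
{
   for (size_t i = 0; i < sizeof(descs) / sizeof(descs[0]); i++) {
      ir_to_desc[descs[i].ir] = &descs[i];
      hw_to_desc[descs[i].hw] = &descs[i];
   }
}
int main(void)
{
   init();
   /* Encode: IR -> HW; decode: HW -> IR; unknown HW yields NULL. */
   assert(ir_to_desc[1]->hw == 0x61);
   assert(hw_to_desc[0x60]->ir == 0);
   assert(hw_to_desc[0x7f] == NULL);
   return 0;
}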
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,790 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_kernel.h"
#include "brw_nir.h"
#include "intel_nir.h"
#include "intel_nir.h"
#include "nir_clc_helpers.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/spirv/nir_spirv.h"
#include "dev/intel_debug.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
static const nir_shader *
load_clc_shader(struct brw_compiler *compiler, struct disk_cache *disk_cache,
const nir_shader_compiler_options *nir_options,
const struct spirv_to_nir_options *spirv_options)
{
if (compiler->clc_shader)
return compiler->clc_shader;
nir_shader *nir = nir_load_libclc_shader(64, disk_cache,
spirv_options, nir_options,
disk_cache != NULL);
if (nir == NULL)
return NULL;
const nir_shader *old_nir =
p_atomic_cmpxchg(&compiler->clc_shader, NULL, nir);
if (old_nir == NULL) {
/* We won the race */
ralloc_steal(compiler, nir);
return nir;
} else {
/* Someone else built the shader first */
ralloc_free(nir);
return old_nir;
}
}
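load_clc_shader() above uses the classic lock-free once-initialization idiom: build the object speculatively, publish it with a compare-and-swap, and free the loser's copy. The same pattern in standalone C11, with atomic_compare_exchange_strong standing in for p_atomic_cmpxchg and an int in place of the shader:
#include <assert.h>
#include <stdatomic.h>
#include <stdlib.h>
static _Atomic(int *) cached;
static int *get_shared(void)
{
   int *cur = atomic_load(&cached);
   if (cur)
      return cur;                        /* fast path: already published */
   int *fresh = malloc(sizeof(*fresh));  /* build speculatively */
   *fresh = 42;
   int *expected = NULL;
   if (atomic_compare_exchange_strong(&cached, &expected, fresh))
      return fresh;                      /* we won the race */
   free(fresh);                          /* someone else published first */
   return expected;                      /* failed CAS wrote the winner here */
}
int main(void)
{
   assert(*get_shared() == 42);
   assert(get_shared() == get_shared()); /* stable after first call */
   return 0;
}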
static nir_builder
builder_init_new_impl(nir_function *func)
{
nir_function_impl *impl = nir_function_impl_create(func);
return nir_builder_at(nir_before_impl(impl));
}
static void
implement_atomic_builtin(nir_function *func, nir_atomic_op atomic_op,
enum glsl_base_type data_base_type,
nir_variable_mode mode)
{
nir_builder b = builder_init_new_impl(func);
const struct glsl_type *data_type = glsl_scalar_type(data_base_type);
unsigned p = 0;
nir_deref_instr *ret = NULL;
ret = nir_build_deref_cast(&b, nir_load_param(&b, p++),
nir_var_function_temp, data_type, 0);
nir_intrinsic_op op = nir_intrinsic_deref_atomic;
nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b.shader, op);
nir_intrinsic_set_atomic_op(atomic, atomic_op);
for (unsigned i = 0; i < nir_intrinsic_infos[op].num_srcs; i++) {
nir_def *src = nir_load_param(&b, p++);
if (i == 0) {
/* The first source is our deref */
assert(nir_intrinsic_infos[op].src_components[i] == -1);
src = &nir_build_deref_cast(&b, src, mode, data_type, 0)->def;
}
atomic->src[i] = nir_src_for_ssa(src);
}
nir_def_init_for_type(&atomic->instr, &atomic->def, data_type);
nir_builder_instr_insert(&b, &atomic->instr);
nir_store_deref(&b, ret, &atomic->def, ~0);
}
static void
implement_sub_group_ballot_builtin(nir_function *func)
{
nir_builder b = builder_init_new_impl(func);
nir_deref_instr *ret =
nir_build_deref_cast(&b, nir_load_param(&b, 0),
nir_var_function_temp, glsl_uint_type(), 0);
nir_def *cond = nir_load_param(&b, 1);
nir_intrinsic_instr *ballot =
nir_intrinsic_instr_create(b.shader, nir_intrinsic_ballot);
ballot->src[0] = nir_src_for_ssa(cond);
ballot->num_components = 1;
nir_def_init(&ballot->instr, &ballot->def, 1, 32);
nir_builder_instr_insert(&b, &ballot->instr);
nir_store_deref(&b, ret, &ballot->def, ~0);
}
static bool
implement_intel_builtins(nir_shader *nir)
{
bool progress = false;
nir_foreach_function(func, nir) {
if (strcmp(func->name, "_Z10atomic_minPU3AS1Vff") == 0) {
/* float atom_min(__global float volatile *p, float val) */
implement_atomic_builtin(func, nir_atomic_op_fmin,
GLSL_TYPE_FLOAT, nir_var_mem_global);
progress = true;
} else if (strcmp(func->name, "_Z10atomic_maxPU3AS1Vff") == 0) {
/* float atom_max(__global float volatile *p, float val) */
implement_atomic_builtin(func, nir_atomic_op_fmax,
GLSL_TYPE_FLOAT, nir_var_mem_global);
progress = true;
} else if (strcmp(func->name, "_Z10atomic_minPU3AS3Vff") == 0) {
/* float atomic_min(__shared float volatile *, float) */
implement_atomic_builtin(func, nir_atomic_op_fmin,
GLSL_TYPE_FLOAT, nir_var_mem_shared);
progress = true;
} else if (strcmp(func->name, "_Z10atomic_maxPU3AS3Vff") == 0) {
/* float atomic_max(__shared float volatile *, float) */
implement_atomic_builtin(func, nir_atomic_op_fmax,
GLSL_TYPE_FLOAT, nir_var_mem_shared);
progress = true;
} else if (strcmp(func->name, "intel_sub_group_ballot") == 0) {
implement_sub_group_ballot_builtin(func);
progress = true;
}
}
nir_shader_preserve_all_metadata(nir);
return progress;
}
static bool
lower_kernel_intrinsics(nir_shader *nir)
{
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
bool progress = false;
unsigned kernel_sysvals_start = 0;
unsigned kernel_arg_start = sizeof(struct brw_kernel_sysvals);
nir->num_uniforms += kernel_arg_start;
nir_builder b = nir_builder_create(impl);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_kernel_input: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
load->num_components = intrin->num_components;
load->src[0] = nir_src_for_ssa(nir_u2u32(&b, intrin->src[0].ssa));
nir_intrinsic_set_base(load, kernel_arg_start);
nir_intrinsic_set_range(load, nir->num_uniforms);
nir_def_init(&load->instr, &load->def,
intrin->def.num_components,
intrin->def.bit_size);
nir_builder_instr_insert(&b, &load->instr);
nir_def_rewrite_uses(&intrin->def, &load->def);
progress = true;
break;
}
case nir_intrinsic_load_constant_base_ptr: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_def *const_data_base_addr = nir_pack_64_2x32_split(&b,
nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
nir_def_rewrite_uses(&intrin->def, const_data_base_addr);
progress = true;
break;
}
case nir_intrinsic_load_num_workgroups: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
load->num_components = 3;
load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
nir_intrinsic_set_base(load, kernel_sysvals_start +
offsetof(struct brw_kernel_sysvals, num_work_groups));
nir_intrinsic_set_range(load, 3 * 4);
nir_def_init(&load->instr, &load->def, 3, 32);
nir_builder_instr_insert(&b, &load->instr);
nir_def_rewrite_uses(&intrin->def, &load->def);
progress = true;
break;
}
default:
break;
}
}
}
if (progress) {
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
} else {
nir_metadata_preserve(impl, nir_metadata_all);
}
return progress;
}
bool
brw_kernel_from_spirv(struct brw_compiler *compiler,
struct disk_cache *disk_cache,
struct brw_kernel *kernel,
void *log_data, void *mem_ctx,
const uint32_t *spirv, size_t spirv_size,
const char *entrypoint_name,
char **error_str)
{
const struct intel_device_info *devinfo = compiler->devinfo;
const nir_shader_compiler_options *nir_options =
compiler->nir_options[MESA_SHADER_KERNEL];
struct spirv_to_nir_options spirv_options = {
.environment = NIR_SPIRV_OPENCL,
.caps = {
.address = true,
.float16 = devinfo->ver >= 8,
.float64 = devinfo->ver >= 8,
.groups = true,
.image_write_without_format = true,
.int8 = devinfo->ver >= 8,
.int16 = devinfo->ver >= 8,
.int64 = devinfo->ver >= 8,
.int64_atomics = devinfo->ver >= 9,
.kernel = true,
.linkage = true, /* We receive linked kernel from clc */
.float_controls = devinfo->ver >= 8,
.generic_pointers = true,
.storage_8bit = devinfo->ver >= 8,
.storage_16bit = devinfo->ver >= 8,
.subgroup_arithmetic = true,
.subgroup_basic = true,
.subgroup_ballot = true,
.subgroup_dispatch = true,
.subgroup_quad = true,
.subgroup_shuffle = true,
.subgroup_vote = true,
.intel_subgroup_shuffle = true,
.intel_subgroup_buffer_block_io = true,
},
.shared_addr_format = nir_address_format_62bit_generic,
.global_addr_format = nir_address_format_62bit_generic,
.temp_addr_format = nir_address_format_62bit_generic,
.constant_addr_format = nir_address_format_64bit_global,
};
spirv_options.clc_shader = load_clc_shader(compiler, disk_cache,
nir_options, &spirv_options);
if (spirv_options.clc_shader == NULL) {
fprintf(stderr, "ERROR: libclc shader missing."
" Consider installing the libclc package\n");
abort();
}
assert(spirv_size % 4 == 0);
nir_shader *nir =
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
entrypoint_name, &spirv_options, nir_options);
nir_validate_shader(nir, "after spirv_to_nir");
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
ralloc_steal(mem_ctx, nir);
nir->info.name = ralloc_strdup(nir, entrypoint_name);
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
nir_print_shader(nir, stderr);
}
NIR_PASS_V(nir, implement_intel_builtins);
NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
/* We have to lower away local constant initializers right before we
* inline functions. That way they get properly initialized at the top
* of the function and not at the top of its caller.
*/
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS_V(nir, nir_lower_returns);
NIR_PASS_V(nir, nir_inline_functions);
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_deref);
/* Pick off the single entrypoint that we want */
nir_remove_non_entrypoints(nir);
/* Now that we've deleted all but the main function, we can go ahead and
* lower the rest of the constant initializers. We do this here so that
* nir_remove_dead_variables and split_per_member_structs below see the
* corresponding stores.
*/
NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
/* LLVM loves to take advantage of the fact that vec3s in OpenCL are 16B
* aligned and so it can just read/write them as vec4s. This results in a
* LOT of vec4->vec3 casts on loads and stores. One solution to this
* problem is to get rid of all vec3 variables.
*/
NIR_PASS_V(nir, nir_lower_vec3_to_vec4,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global |
nir_var_mem_constant);
/* We assign explicit types early so that the optimizer can take advantage
* of that information and hopefully get rid of some of our memcpys.
*/
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_uniform |
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global,
glsl_get_cl_type_size_align);
struct brw_nir_compiler_opts opts = {};
brw_preprocess_nir(compiler, nir, &opts);
int max_arg_idx = -1;
nir_foreach_uniform_variable(var, nir) {
assert(var->data.location < 256);
max_arg_idx = MAX2(max_arg_idx, var->data.location);
}
kernel->args_size = nir->num_uniforms;
kernel->arg_count = max_arg_idx + 1;
/* No bindings */
struct brw_kernel_arg_desc *args =
rzalloc_array(mem_ctx, struct brw_kernel_arg_desc, kernel->arg_count);
kernel->args = args;
nir_foreach_uniform_variable(var, nir) {
struct brw_kernel_arg_desc arg_desc = {
.offset = var->data.driver_location,
.size = glsl_get_explicit_size(var->type, false),
};
assert(arg_desc.offset + arg_desc.size <= nir->num_uniforms);
assert(var->data.location >= 0);
args[var->data.location] = arg_desc;
}
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL);
/* Lower again, this time after dead-variables to get more compact variable
* layouts.
*/
nir->global_mem_size = 0;
nir->scratch_size = 0;
nir->info.shared_size = 0;
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
glsl_get_cl_type_size_align);
if (nir->constant_data_size > 0) {
assert(nir->constant_data == NULL);
nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
nir_gather_explicit_io_initializers(nir, nir->constant_data,
nir->constant_data_size,
nir_var_mem_constant);
}
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
nir_print_shader(nir, stderr);
}
NIR_PASS_V(nir, nir_lower_memcpy);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
nir_address_format_64bit_global);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
nir_address_format_32bit_offset_as_64bit);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global,
nir_address_format_62bit_generic);
NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
NIR_PASS_V(nir, lower_kernel_intrinsics);
struct brw_cs_prog_key key = { };
memset(&kernel->prog_data, 0, sizeof(kernel->prog_data));
kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4);
struct brw_compile_cs_params params = {
.base = {
.nir = nir,
.stats = kernel->stats,
.log_data = log_data,
.mem_ctx = mem_ctx,
},
.key = &key,
.prog_data = &kernel->prog_data,
};
kernel->code = brw_compile_cs(compiler, &params);
if (error_str)
*error_str = params.base.error_str;
return kernel->code != NULL;
}
static nir_def *
rebuild_value_from_store(struct util_dynarray *stores,
nir_def *value, unsigned read_offset)
{
unsigned read_size = value->num_components * value->bit_size / 8;
util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) {
nir_intrinsic_instr *store = *_store;
unsigned write_offset = nir_src_as_uint(store->src[1]);
unsigned write_size = nir_src_num_components(store->src[0]) *
nir_src_bit_size(store->src[0]) / 8;
if (write_offset <= read_offset &&
(write_offset + write_size) >= (read_offset + read_size)) {
assert(nir_block_dominates(store->instr.block, value->parent_instr->block));
assert(write_size == read_size);
return store->src[0].ssa;
}
}
unreachable("Matching scratch store not found");
}
/**
* Remove temporary variables stored to scratch to be then reloaded
* immediately. Remap the load to the store SSA value.
*
* This workaround is only meant to be applied to shaders in src/intel/shaders
* where we know there should be no issue. More complex cases might not work
* with this approach.
*/
static bool
nir_remove_llvm17_scratch(nir_shader *nir)
{
struct util_dynarray scratch_stores;
void *mem_ctx = ralloc_context(NULL);
util_dynarray_init(&scratch_stores, mem_ctx);
nir_foreach_function_impl(func, nir) {
nir_foreach_block(block, func) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_store_scratch)
continue;
nir_const_value *offset = nir_src_as_const_value(intrin->src[1]);
if (offset != NULL) {
util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin);
}
}
}
}
bool progress = false;
if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) {
nir_foreach_function_impl(func, nir) {
nir_foreach_block(block, func) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_scratch)
continue;
nir_const_value *offset = nir_src_as_const_value(intrin->src[0]);
if (offset == NULL)
continue;
nir_def_rewrite_uses(&intrin->def,
rebuild_value_from_store(
&scratch_stores, &intrin->def,
nir_src_as_uint(intrin->src[0])));
nir_instr_remove(instr);
progress = true;
}
}
}
}
util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr *, _store) {
nir_intrinsic_instr *store = *_store;
nir_instr_remove(&store->instr);
}
/* Quick sanity check */
assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 ||
progress);
ralloc_free(mem_ctx);
return progress;
}
static void
cleanup_llvm17_scratch(nir_shader *nir)
{
{
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_algebraic);
} while (progress);
}
nir_remove_llvm17_scratch(nir);
{
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_algebraic);
} while (progress);
}
}
nir_shader *
brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size,
bool llvm17_wa)
{
struct spirv_to_nir_options spirv_options = {
.environment = NIR_SPIRV_OPENCL,
.caps = {
.address = true,
.groups = true,
.image_write_without_format = true,
.int8 = true,
.int16 = true,
.int64 = true,
.int64_atomics = true,
.kernel = true,
.linkage = true, /* We receive linked kernel from clc */
.float_controls = true,
.generic_pointers = true,
.storage_8bit = true,
.storage_16bit = true,
.subgroup_arithmetic = true,
.subgroup_basic = true,
.subgroup_ballot = true,
.subgroup_dispatch = true,
.subgroup_quad = true,
.subgroup_shuffle = true,
.subgroup_vote = true,
.intel_subgroup_shuffle = true,
.intel_subgroup_buffer_block_io = true,
},
.shared_addr_format = nir_address_format_62bit_generic,
.global_addr_format = nir_address_format_62bit_generic,
.temp_addr_format = nir_address_format_62bit_generic,
.constant_addr_format = nir_address_format_64bit_global,
.create_library = true,
};
assert(spirv_size % 4 == 0);
nir_shader *nir =
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
"library", &spirv_options, &brw_scalar_nir_options);
nir_validate_shader(nir, "after spirv_to_nir");
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
ralloc_steal(mem_ctx, nir);
nir->info.name = ralloc_strdup(nir, "library");
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
nir_print_shader(nir, stderr);
}
NIR_PASS_V(nir, implement_intel_builtins);
NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader);
/* We have to lower away local constant initializers right before we
* inline functions. That way they get properly initialized at the top
* of the function and not at the top of its caller.
*/
NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp |
nir_var_function_temp));
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
{
bool progress;
do
{
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
NIR_PASS(progress, nir, nir_opt_deref);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_undef);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_algebraic);
} while (progress);
}
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS_V(nir, nir_lower_returns);
NIR_PASS_V(nir, nir_inline_functions);
assert(nir->scratch_size == 0);
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align);
{
bool progress;
do
{
progress = false;
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
NIR_PASS(progress, nir, nir_opt_deref);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_opt_undef);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_split_var_copies);
NIR_PASS(progress, nir, nir_lower_var_copies);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_algebraic);
NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false);
NIR_PASS(progress, nir, nir_opt_dead_cf);
NIR_PASS(progress, nir, nir_opt_remove_phis);
NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform);
NIR_PASS(progress, nir, nir_opt_memcpy);
} while (progress);
}
NIR_PASS_V(nir, nir_scale_fdiv);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo |
nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL);
nir->scratch_size = 0;
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp |
nir_var_mem_global | nir_var_mem_constant,
glsl_get_cl_type_size_align);
// Lower memcpy - needs to wait until types are sized
{
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_opt_memcpy);
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
NIR_PASS(progress, nir, nir_opt_deref);
NIR_PASS(progress, nir, nir_opt_dce);
NIR_PASS(progress, nir, nir_split_var_copies);
NIR_PASS(progress, nir, nir_lower_var_copies);
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_cse);
} while (progress);
}
NIR_PASS_V(nir, nir_lower_memcpy);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform,
nir_address_format_32bit_offset_as_64bit);
NIR_PASS_V(nir, nir_lower_system_values);
/* Hopefully we can drop this once lower_vars_to_ssa has improved to not
* lower everything to scratch.
*/
if (llvm17_wa)
cleanup_llvm17_scratch(nir);
/* Lower again, this time after dead-variables to get more compact variable
* layouts.
*/
nir->global_mem_size = 0;
nir->scratch_size = 0;
nir->info.shared_size = 0;
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
glsl_get_cl_type_size_align);
if (nir->constant_data_size > 0) {
assert(nir->constant_data == NULL);
nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
nir_gather_explicit_io_initializers(nir, nir->constant_data,
nir->constant_data_size,
nir_var_mem_constant);
}
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
nir_address_format_64bit_global);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
nir_address_format_32bit_offset_as_64bit);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_shared | nir_var_mem_global,
nir_address_format_62bit_generic);
if (INTEL_DEBUG(DEBUG_CS)) {
/* Re-index SSA defs so we print more sensible numbers. */
nir_foreach_function_impl(impl, nir) {
nir_index_ssa_defs(impl);
}
fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
nir_print_shader(nir, stderr);
}
return nir;
}

View file

@ -0,0 +1,78 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_KERNEL_H
#define BRW_KERNEL_H
#include "brw_compiler.h"
struct disk_cache;
#ifdef __cplusplus
extern "C" {
#endif
/** Software interface for system values in kernels
*
* These are intended to go at the start of the kernel argument buffer.
*/
struct brw_kernel_sysvals {
uint32_t num_work_groups[3];
uint32_t pad[5];
};
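Because this block sits at offset 0 of the kernel argument buffer, argument offsets are biased by sizeof(struct brw_kernel_sysvals). A standalone sanity check of that layout, mirroring the struct above:
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
struct sysvals {
   uint32_t num_work_groups[3];
   uint32_t pad[5];
};
int main(void)
{
   /* 3 + 5 dwords = 32 bytes; kernel arguments start right after. */
   assert(sizeof(struct sysvals) == 32);
   assert(offsetof(struct sysvals, num_work_groups) == 0);
   unsigned arg0_offset_in_buffer = sizeof(struct sysvals) + 0;
   assert(arg0_offset_in_buffer == 32);
   return 0;
}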
struct brw_kernel_arg_desc {
uint16_t offset;
uint16_t size;
};
struct brw_kernel {
struct brw_cs_prog_data prog_data;
struct brw_compile_stats stats[3];
uint16_t args_size;
uint16_t arg_count;
const struct brw_kernel_arg_desc *args;
const void *code;
};
bool
brw_kernel_from_spirv(struct brw_compiler *compiler,
struct disk_cache *disk_cache,
struct brw_kernel *kernel,
void *log_data, void *mem_ctx,
const uint32_t *spirv, size_t spirv_size,
const char *entrypoint_name,
char **error_str);
nir_shader *
brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size,
bool llvm17_wa);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* BRW_KERNEL_H */

View file

@ -0,0 +1,465 @@
%option yylineno
%option nounput
%{
#include <string.h>
#include "brw_asm.h"
#undef ALIGN16
#include "brw_gram.tab.h"
/* Locations */
int yycolumn = 1;
int saved_state = 0;
extern char *input_filename;
#define YY_NO_INPUT
#define YY_USER_ACTION \
yylloc.first_line = yylloc.last_line = yylineno; \
yylloc.first_column = yycolumn; \
yylloc.last_column = yycolumn + yyleng - 1; \
yycolumn += yyleng;
%}
%x BLOCK_COMMENT
%x FILENAME
%x CHANNEL
%x REG
%x DOTSEL
%x LABEL
%x MSGDESC
%%
/* eat up single line comment */
\/\/.*[\r\n] { yycolumn = 1; }
/* eat up multiline comment */
\/\* { saved_state = YYSTATE; BEGIN(BLOCK_COMMENT); }
<BLOCK_COMMENT>\*\/ { BEGIN(saved_state); }
<BLOCK_COMMENT>. { }
<BLOCK_COMMENT>[\r\n] { }
<FILENAME>\"[^\"]+\" {
char *name = malloc(yyleng - 1);
memmove(name, yytext + 1, yyleng - 2);
name[yyleng - 2] = '\0'; /* buffer holds yyleng - 2 chars plus the terminator */
input_filename = name;
}
/* null register */
null { BEGIN(REG); return NULL_TOKEN; }
/* Opcodes */
add { yylval.integer = BRW_OPCODE_ADD; return ADD; }
add3 { yylval.integer = BRW_OPCODE_ADD3; return ADD3; }
addc { yylval.integer = BRW_OPCODE_ADDC; return ADDC; }
and { yylval.integer = BRW_OPCODE_AND; return AND; }
asr { yylval.integer = BRW_OPCODE_ASR; return ASR; }
avg { yylval.integer = BRW_OPCODE_AVG; return AVG; }
bfe { yylval.integer = BRW_OPCODE_BFE; return BFE; }
bfi1 { yylval.integer = BRW_OPCODE_BFI1; return BFI1; }
bfi2 { yylval.integer = BRW_OPCODE_BFI2; return BFI2; }
bfrev { yylval.integer = BRW_OPCODE_BFREV; return BFREV; }
brc { yylval.integer = BRW_OPCODE_BRC; return BRC; }
brd { yylval.integer = BRW_OPCODE_BRD; return BRD; }
break { yylval.integer = BRW_OPCODE_BREAK; return BREAK; }
call { yylval.integer = BRW_OPCODE_CALL; return CALL; }
calla { yylval.integer = BRW_OPCODE_CALLA; return CALLA; }
case { yylval.integer = BRW_OPCODE_CASE; return CASE; }
cbit { yylval.integer = BRW_OPCODE_CBIT; return CBIT; }
cmp { yylval.integer = BRW_OPCODE_CMP; return CMP; }
cmpn { yylval.integer = BRW_OPCODE_CMPN; return CMPN; }
cont { yylval.integer = BRW_OPCODE_CONTINUE; return CONT; }
csel { yylval.integer = BRW_OPCODE_CSEL; return CSEL; }
dim { yylval.integer = BRW_OPCODE_DIM; return DIM; }
do { yylval.integer = BRW_OPCODE_DO; return DO; }
dp2 { yylval.integer = BRW_OPCODE_DP2; return DP2; }
dp3 { yylval.integer = BRW_OPCODE_DP3; return DP3; }
dp4 { yylval.integer = BRW_OPCODE_DP4; return DP4; }
dp4a { yylval.integer = BRW_OPCODE_DP4A; return DP4A; }
dph { yylval.integer = BRW_OPCODE_DPH; return DPH; }
else { yylval.integer = BRW_OPCODE_ELSE; return ELSE; }
endif { yylval.integer = BRW_OPCODE_ENDIF; return ENDIF; }
f16to32 { yylval.integer = BRW_OPCODE_F16TO32; return F16TO32; }
f32to16 { yylval.integer = BRW_OPCODE_F32TO16; return F32TO16; }
fbh { yylval.integer = BRW_OPCODE_FBH; return FBH; }
fbl { yylval.integer = BRW_OPCODE_FBL; return FBL; }
fork { yylval.integer = BRW_OPCODE_FORK; return FORK; }
frc { yylval.integer = BRW_OPCODE_FRC; return FRC; }
goto { yylval.integer = BRW_OPCODE_GOTO; return GOTO; }
halt { yylval.integer = BRW_OPCODE_HALT; return HALT; }
if { yylval.integer = BRW_OPCODE_IF; return IF; }
iff { yylval.integer = BRW_OPCODE_IFF; return IFF; }
illegal { yylval.integer = BRW_OPCODE_ILLEGAL; return ILLEGAL; }
jmpi { yylval.integer = BRW_OPCODE_JMPI; return JMPI; }
line { yylval.integer = BRW_OPCODE_LINE; return LINE; }
lrp { yylval.integer = BRW_OPCODE_LRP; return LRP; }
lzd { yylval.integer = BRW_OPCODE_LZD; return LZD; }
mac { yylval.integer = BRW_OPCODE_MAC; return MAC; }
mach { yylval.integer = BRW_OPCODE_MACH; return MACH; }
mad { yylval.integer = BRW_OPCODE_MAD; return MAD; }
madm { yylval.integer = BRW_OPCODE_MADM; return MADM; }
mov { yylval.integer = BRW_OPCODE_MOV; return MOV; }
movi { yylval.integer = BRW_OPCODE_MOVI; return MOVI; }
mul { yylval.integer = BRW_OPCODE_MUL; return MUL; }
mrest { yylval.integer = BRW_OPCODE_MREST; return MREST; }
msave { yylval.integer = BRW_OPCODE_MSAVE; return MSAVE; }
nenop { yylval.integer = BRW_OPCODE_NENOP; return NENOP; }
nop { yylval.integer = BRW_OPCODE_NOP; return NOP; }
not { yylval.integer = BRW_OPCODE_NOT; return NOT; }
or { yylval.integer = BRW_OPCODE_OR; return OR; }
pln { yylval.integer = BRW_OPCODE_PLN; return PLN; }
pop { yylval.integer = BRW_OPCODE_POP; return POP; }
push { yylval.integer = BRW_OPCODE_PUSH; return PUSH; }
ret { yylval.integer = BRW_OPCODE_RET; return RET; }
rndd { yylval.integer = BRW_OPCODE_RNDD; return RNDD; }
rnde { yylval.integer = BRW_OPCODE_RNDE; return RNDE; }
rndu { yylval.integer = BRW_OPCODE_RNDU; return RNDU; }
rndz { yylval.integer = BRW_OPCODE_RNDZ; return RNDZ; }
rol { yylval.integer = BRW_OPCODE_ROL; return ROL; }
ror { yylval.integer = BRW_OPCODE_ROR; return ROR; }
sad2 { yylval.integer = BRW_OPCODE_SAD2; return SAD2; }
sada2 { yylval.integer = BRW_OPCODE_SADA2; return SADA2; }
sel { yylval.integer = BRW_OPCODE_SEL; return SEL; }
send {
yylval.integer = BRW_OPCODE_SEND;
return p->devinfo->ver < 12 ? SEND_GFX4 : SEND_GFX12;
}
sendc {
yylval.integer = BRW_OPCODE_SENDC;
return p->devinfo->ver < 12 ? SENDC_GFX4 : SENDC_GFX12;
}
sends { yylval.integer = BRW_OPCODE_SENDS; return SENDS; }
sendsc { yylval.integer = BRW_OPCODE_SENDSC; return SENDSC; }
shl { yylval.integer = BRW_OPCODE_SHL; return SHL; }
shr { yylval.integer = BRW_OPCODE_SHR; return SHR; }
smov { yylval.integer = BRW_OPCODE_SMOV; return SMOV; }
subb { yylval.integer = BRW_OPCODE_SUBB; return SUBB; }
wait { yylval.integer = BRW_OPCODE_WAIT; return WAIT; }
while { yylval.integer = BRW_OPCODE_WHILE; return WHILE; }
xor { yylval.integer = BRW_OPCODE_XOR; return XOR; }
sync { yylval.integer = BRW_OPCODE_SYNC; return SYNC; }
/* extended math functions */
cos { yylval.integer = BRW_MATH_FUNCTION_COS; return COS; }
exp { yylval.integer = BRW_MATH_FUNCTION_EXP; return EXP; }
fdiv { yylval.integer = BRW_MATH_FUNCTION_FDIV; return FDIV; }
inv { yylval.integer = BRW_MATH_FUNCTION_INV; return INV; }
invm { yylval.integer = GFX8_MATH_FUNCTION_INVM; return INVM; }
intdiv {
yylval.integer = BRW_MATH_FUNCTION_INT_DIV_QUOTIENT;
return INTDIV;
}
intdivmod {
yylval.integer =
BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
return INTDIVMOD;
}
intmod {
yylval.integer = BRW_MATH_FUNCTION_INT_DIV_REMAINDER;
return INTMOD;
}
log { yylval.integer = BRW_MATH_FUNCTION_LOG; return LOG; }
pow { yylval.integer = BRW_MATH_FUNCTION_POW; return POW; }
rsq { yylval.integer = BRW_MATH_FUNCTION_RSQ; return RSQ; }
rsqrtm { yylval.integer = GFX8_MATH_FUNCTION_RSQRTM; return RSQRTM; }
sin { yylval.integer = BRW_MATH_FUNCTION_SIN; return SIN; }
sqrt { yylval.integer = BRW_MATH_FUNCTION_SQRT; return SQRT; }
sincos { yylval.integer = BRW_MATH_FUNCTION_SINCOS; return SINCOS; }
/* sync instruction */
allrd { yylval.integer = TGL_SYNC_ALLRD; return ALLRD; }
allwr { yylval.integer = TGL_SYNC_ALLWR; return ALLWR; }
fence { yylval.integer = TGL_SYNC_FENCE; return FENCE; }
bar { yylval.integer = TGL_SYNC_BAR; return BAR; }
host { yylval.integer = TGL_SYNC_HOST; return HOST; }
/* shared functions for send instruction */
sampler { return SAMPLER; }
dp_sampler { return DP_SAMPLER; }
gateway { return GATEWAY; }
urb { return URB; }
thread_spawner { return THREAD_SPAWNER; }
render { return RENDER; }
const { return CONST; }
data { return DATA; }
cre { return CRE; }
math { return MATH; }
read { return READ; }
write { return WRITE; }
vme { return VME; }
"pixel interp" { return PIXEL_INTERP; }
"dp data 1" { return DP_DATA_1; }
"rt accel" { return RT_ACCEL; }
slm { return SLM; }
tgm { return TGM; }
ugm { return UGM; }
";" { return SEMICOLON; }
":" { return COLON; }
"(" { return LPAREN; }
")" { return RPAREN; }
"{" { return LCURLY; }
"}" { return RCURLY; }
"[" { return LSQUARE; }
"]" { return RSQUARE; }
"<" { return LANGLE; }
">" { return RANGLE; }
"," { return COMMA; }
"." { return DOT; }
"+" { return PLUS; }
"-" { return MINUS; }
"~" { return MINUS; }
"(abs)" { return ABS; }
"VxH" { return VxH; }
<REG>"<" { return LANGLE; }
<REG>[0-9][0-9]* {
yylval.integer = strtoul(yytext, NULL, 10);
return INTEGER;
}
<REG>">" { return RANGLE; }
<REG>"," { return COMMA; }
<REG>"." { BEGIN(DOTSEL); return DOT; }
<REG>";" { return SEMICOLON; }
<DOTSEL>"x" { yylval.integer = BRW_CHANNEL_X; return X; }
<DOTSEL>"y" { yylval.integer = BRW_CHANNEL_Y; return Y; }
<DOTSEL>"z" { yylval.integer = BRW_CHANNEL_Z; return Z; }
<DOTSEL>"w" { yylval.integer = BRW_CHANNEL_W; return W; }
<DOTSEL>[0-9][0-9]* {
yylval.integer = strtoul(yytext, NULL, 10);
BEGIN(REG);
return INTEGER;
}
<DOTSEL>. { yyless(0); BEGIN(INITIAL); }
<REG>. { yyless(0); BEGIN(INITIAL); }
/* Access mode */
"align1" { return ALIGN1; }
"align16" { return ALIGN16; }
/* Accumulator write control */
AccWrEnable { return ACCWREN; }
/* Mask control (formerly WECtrl/Write Enable Control) */
"WE_all" { return WECTRL; }
/* Compaction control */
compacted { return CMPTCTRL; }
/* Debug control */
breakpoint { return BREAKPOINT; }
/* Dependency control */
NoDDClr { return NODDCLR; }
NoDDChk { return NODDCHK; }
/* End of thread */
EOT { return EOT; }
/* Mask control */
nomask { return MASK_DISABLE; }
/* Channel */
<CHANNEL>"x" { yylval.integer = BRW_CHANNEL_X; return X; }
<CHANNEL>"y" { yylval.integer = BRW_CHANNEL_Y; return Y; }
<CHANNEL>"z" { yylval.integer = BRW_CHANNEL_Z; return Z; }
<CHANNEL>"w" { yylval.integer = BRW_CHANNEL_W; return W; }
<CHANNEL>[0-9][0-9]* {
yylval.integer = strtoul(yytext, NULL, 10);
return INTEGER;
}
<CHANNEL>"." { return DOT; }
<CHANNEL>. { yyless(0); BEGIN(INITIAL); }
/* Predicate Control */
<CHANNEL>".anyv" { yylval.integer = BRW_PREDICATE_ALIGN1_ANYV; return ANYV; }
<CHANNEL>".allv" { yylval.integer = BRW_PREDICATE_ALIGN1_ALLV; return ALLV; }
<CHANNEL>".any2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY2H; return ANY2H; }
<CHANNEL>".all2h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL2H; return ALL2H; }
<CHANNEL>".any4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ANY4H; return ANY4H; }
<CHANNEL>".all4h" { yylval.integer = BRW_PREDICATE_ALIGN16_ALL4H; return ALL4H; }
<CHANNEL>".any8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY8H; return ANY8H; }
<CHANNEL>".all8h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL8H; return ALL8H; }
<CHANNEL>".any16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY16H; return ANY16H; }
<CHANNEL>".all16h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL16H; return ALL16H; }
<CHANNEL>".any32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ANY32H; return ANY32H; }
<CHANNEL>".all32h" { yylval.integer = BRW_PREDICATE_ALIGN1_ALL32H; return ALL32H; }
/* Saturation */
".sat" { return SATURATE; }
/* Thread control */
atomic { return ATOMIC; }
switch { return SWITCH; }
/* compression control */
compr { return COMPR; }
compr4 { return COMPR4; }
sechalf { return SECHALF; }
/* Quarter Control */
1[HNQ] { }
"2Q" { return QTR_2Q; }
"3Q" { return QTR_3Q; }
"4Q" { return QTR_4Q; }
"2H" { return QTR_2H; }
"2N" { return QTR_2N; }
"3N" { return QTR_3N; }
"4N" { return QTR_4N; }
"5N" { return QTR_5N; }
"6N" { return QTR_6N; }
"7N" { return QTR_7N; }
"8N" { return QTR_8N; }
/* data types */
:?B { return TYPE_B; }
:?D { return TYPE_D; }
:?DF { return TYPE_DF; }
:?F { return TYPE_F; }
:?HF { return TYPE_HF; }
:?NF { return TYPE_NF; }
:?Q { return TYPE_Q; }
:?UB { return TYPE_UB; }
:?UD { return TYPE_UD; }
:?UW { return TYPE_UW; }
:?UQ { return TYPE_UQ; }
:?UV { return TYPE_UV; }
:?V { return TYPE_V; }
:?VF { return TYPE_VF; }
:?W { return TYPE_W; }
/* Address registers */
"a0" { return ADDRREG; }
/* accumulator registers */
"acc"[0-9]+ { yylval.integer = atoi(yytext + 3); return ACCREG; }
/* channel enable registers */
"ce0" { return CHANNELENABLEREG; }
/* control registers */
"cr0" { return CONTROLREG; }
/* flag registers */
"f"[0|1] { BEGIN(CHANNEL); yylval.integer = atoi(yytext + 1); return FLAGREG; }
/* message control registers */
"m" { return MSGREGFILE; }
m[0-9]+ { yylval.integer = atoi(yytext + 1); BEGIN(REG); return MSGREG; }
/* state register */
sr[0-9]+ { yylval.integer = atoi(yytext + 2); return STATEREG; }
/* notification registers */
"n0" { BEGIN(REG); return NOTIFYREG; }
/* IP register */
"ip" { return IPREG; }
/* Thread control register */
"tdr0" { return THREADREG; }
/* performance register */
"tm0" { BEGIN(REG); return PERFORMANCEREG; }
[gr][0-9]+ {
yylval.integer = atoi(yytext + 1);
BEGIN(REG); return GENREG;
}
[gr] { return GENREGFILE; }
"mask"[0-9]+ { yylval.integer = atoi(yytext + 4); return MASKREG; }
/* Conditional modifiers */
".e" { yylval.integer = BRW_CONDITIONAL_Z; return EQUAL; }
".g" { yylval.integer = BRW_CONDITIONAL_G; return GREATER; }
".ge" { yylval.integer = BRW_CONDITIONAL_GE; return GREATER_EQUAL; }
".l" { yylval.integer = BRW_CONDITIONAL_L; return LESS; }
".le" { yylval.integer = BRW_CONDITIONAL_LE; return LESS_EQUAL; }
".ne" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_EQUAL; }
".nz" { yylval.integer = BRW_CONDITIONAL_NZ; return NOT_ZERO; }
".o" { yylval.integer = BRW_CONDITIONAL_O; return OVERFLOW; }
".r" { yylval.integer = BRW_CONDITIONAL_R; return ROUND_INCREMENT; }
".u" { yylval.integer = BRW_CONDITIONAL_U; return UNORDERED; }
".z" { yylval.integer = BRW_CONDITIONAL_Z; return ZERO; }
/* Eat up JIP and UIP token, their values will be parsed
* in numeric section
*/
"JIP: " { BEGIN(LABEL); }
"UIP: " { BEGIN(LABEL); }
"Jump: " { }
"Pop: " { }
[ \t]+ { }
"MsgDesc: " { BEGIN(MSGDESC); return MSGDESC_BEGIN; }
<MSGDESC>ex_bso { return EX_BSO; }
<MSGDESC>src1_len { return SRC1_LEN; }
<MSGDESC>"=" { return ASSIGN; }
<MSGDESC>[0-9][0-9]* {
yylval.integer = strtoul(yytext, NULL, 10);
return INTEGER;
}
<MSGDESC>"{" { yyless(0); BEGIN(INITIAL); return MSGDESC_END; }
<MSGDESC>. { }
"0x"[0-9a-f][0-9a-f]* {
yylval.llint = strtoull(yytext + 2, NULL, 16);
return LONG;
}
[0-9][0-9]* {
yylval.llint = strtoll(yytext, NULL, 10);
return LONG;
}
/* jump label target */
[a-zA-Z_][0-9a-zA-Z_]*":" {
yylval.string = ralloc_strdup(p->mem_ctx, yytext);
/* Stomp the trailing ':' */
yylval.string[yyleng - 1] = '\0';
return JUMP_LABEL_TARGET;
}
/* jump label */
<LABEL>[a-zA-Z_][0-9a-zA-Z_]* {
yylval.string = ralloc_strdup(p->mem_ctx, yytext);
BEGIN(INITIAL);
return JUMP_LABEL;
}
/* SWSB */
"@"[1-7] { yylval.integer = atoi(yytext + 1); return REG_DIST_CURRENT; }
"F@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_FLOAT; }
"I@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_INT; }
"L@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_LONG; }
"A@"[1-7] { yylval.integer = atoi(yytext + 2); return REG_DIST_ALL; }
"$"[0-9]* { yylval.integer = atoi(yytext + 1); return SBID_ALLOC; }
"$"[0-9]*".src" { yylval.integer = atoi(yytext + 1); return SBID_WAIT_SRC; }
"$"[0-9]*".dst" { yylval.integer = atoi(yytext + 1); return SBID_WAIT_DST; }
\n { yycolumn = 1; }
. {
fprintf(stderr, "%s: %d: %s: at \"%s\"\n",
input_filename, yylineno,
"unexpected token", lex_text());
}
%%
char *
lex_text(void)
{
return yytext;
}
#ifndef yywrap
int yywrap()
{
return -1;
}
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,298 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_NIR_H
#define BRW_NIR_H
#include "brw_reg.h"
#include "compiler/nir/nir.h"
#include "brw_compiler.h"
#include "nir_builder.h"
#ifdef __cplusplus
extern "C" {
#endif
extern const struct nir_shader_compiler_options brw_scalar_nir_options;
extern const struct nir_shader_compiler_options brw_vector_nir_options;
int type_size_vec4(const struct glsl_type *type, bool bindless);
int type_size_dvec4(const struct glsl_type *type, bool bindless);
static inline int
type_size_scalar_bytes(const struct glsl_type *type, bool bindless)
{
return glsl_count_dword_slots(type, bindless) * 4;
}
static inline int
type_size_vec4_bytes(const struct glsl_type *type, bool bindless)
{
return type_size_vec4(type, bindless) * 16;
}
/* Flags set in the instr->pass_flags field by i965 analysis passes */
enum {
BRW_NIR_NON_BOOLEAN = 0x0,
/* Indicates that the given instruction's destination is a boolean
* value but that it needs to be resolved before it can be used.
* On Gen <= 5, CMP instructions return a 32-bit value where the bottom
* bit represents the actual true/false value of the compare and the top
* 31 bits are undefined. In order to use this value, we have to do a
* "resolve" operation by replacing the value of the CMP with -(x & 1)
* to sign-extend the bottom bit to 0/~0.
*/
BRW_NIR_BOOLEAN_NEEDS_RESOLVE = 0x1,
/* Indicates that the given instruction's destination is a boolean
* value that has intentionally been left unresolved. Not all boolean
* values need to be resolved immediately. For instance, if we have
*
* CMP r1 r2 r3
* CMP r4 r5 r6
* AND r7 r1 r4
*
* We don't have to resolve the result of the two CMP instructions
* immediately because the AND still does an AND of the bottom bits.
* Instead, we can save ourselves instructions by delaying the resolve
* until after the AND. The result of the two CMP instructions is left
* as BRW_NIR_BOOLEAN_UNRESOLVED.
*/
BRW_NIR_BOOLEAN_UNRESOLVED = 0x2,
* Indicates that the given instruction's destination is a boolean
* value that does not need a resolve. For instance, if you AND two
* values that are BRW_NIR_BOOLEAN_NEEDS_RESOLVE then we know that both
* values will be 0/~0 before we get them and the result of the AND is
* also guaranteed to be 0/~0 and does not need a resolve.
*/
BRW_NIR_BOOLEAN_NO_RESOLVE = 0x3,
/* A mask to mask the boolean status values off of instr->pass_flags */
BRW_NIR_BOOLEAN_MASK = 0x3,
};
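The resolve operation named above, -(x & 1), keeps only the defined bottom bit and sign-extends it into a canonical 0/~0 boolean, regardless of the undefined upper 31 bits. A standalone check of the identity:
#include <assert.h>
#include <stdint.h>
static int32_t resolve(int32_t x)
{
   return -(x & 1);  /* keep bit 0, sign-extend it across the dword */
}
int main(void)
{
   /* The upper 31 bits are undefined after CMP; resolve ignores them. */
   assert(resolve(0x7ffffffe) == 0);                       /* bit 0 clear -> false */
   assert(resolve(0x00000001) == -1);                      /* bit 0 set -> ~0 */
   assert((uint32_t)resolve((int32_t)0xdeadbeefu) == 0xffffffffu);
   return 0;
}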
void brw_nir_analyze_boolean_resolves(nir_shader *nir);
struct brw_nir_compiler_opts {
/* Soft floating point implementation shader */
const nir_shader *softfp64;
/* Whether robust image access is enabled */
bool robust_image_access;
/* Input vertices for TCS stage (0 means dynamic) */
unsigned input_vertices;
};
/* UBO surface index can come in 2 flavors :
* - nir_intrinsic_resource_intel
* - anything else
*
* In the first case, checking that the surface index is const requires
* checking resource_intel::src[1]. In any other case it's a simple
* nir_src_is_const().
*
* This function should only be called on src[0] of load_ubo intrinsics.
*/
static inline bool
brw_nir_ubo_surface_index_is_pushable(nir_src src)
{
nir_intrinsic_instr *intrin =
src.ssa->parent_instr->type == nir_instr_type_intrinsic ?
nir_instr_as_intrinsic(src.ssa->parent_instr) : NULL;
if (intrin && intrin->intrinsic == nir_intrinsic_resource_intel) {
return (nir_intrinsic_resource_access_intel(intrin) &
nir_resource_intel_pushable);
}
return nir_src_is_const(src);
}
static inline unsigned
brw_nir_ubo_surface_index_get_push_block(nir_src src)
{
if (nir_src_is_const(src))
return nir_src_as_uint(src);
if (!brw_nir_ubo_surface_index_is_pushable(src))
return UINT32_MAX;
assert(src.ssa->parent_instr->type == nir_instr_type_intrinsic);
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
assert(intrin->intrinsic == nir_intrinsic_resource_intel);
return nir_intrinsic_resource_block_intel(intrin);
}
/* This helper returns the binding table index of a surface access (any
* buffer/image/etc...). It works off the source of one of the intrinsics
* (load_ubo, load_ssbo, store_ssbo, load_image, store_image, etc...).
*
* If the source is constant, then this is the binding table index. If we're
* going through a resource_intel intrinsic, then we need to check
* src[1] of that intrinsic.
*/
static inline unsigned
brw_nir_ubo_surface_index_get_bti(nir_src src)
{
if (nir_src_is_const(src))
return nir_src_as_uint(src);
assert(src.ssa->parent_instr->type == nir_instr_type_intrinsic);
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
if (!intrin || intrin->intrinsic != nir_intrinsic_resource_intel)
return UINT32_MAX;
/* In practice we could even drop this intrinsic because bindless
* accesses always operate from a base offset coming from a push constant, so
* they can never be constant.
*/
if (nir_intrinsic_resource_access_intel(intrin) &
nir_resource_intel_bindless)
return UINT32_MAX;
if (!nir_src_is_const(intrin->src[1]))
return UINT32_MAX;
return nir_src_as_uint(intrin->src[1]);
}
void brw_preprocess_nir(const struct brw_compiler *compiler,
nir_shader *nir,
const struct brw_nir_compiler_opts *opts);
void
brw_nir_link_shaders(const struct brw_compiler *compiler,
nir_shader *producer, nir_shader *consumer);
bool brw_nir_lower_cs_intrinsics(nir_shader *nir,
const struct intel_device_info *devinfo,
struct brw_cs_prog_data *prog_data);
bool brw_nir_lower_alpha_to_coverage(nir_shader *shader,
const struct brw_wm_prog_key *key,
const struct brw_wm_prog_data *prog_data);
void brw_nir_lower_vs_inputs(nir_shader *nir,
bool edgeflag_is_last,
const uint8_t *vs_attrib_wa_flags);
void brw_nir_lower_vue_inputs(nir_shader *nir,
const struct intel_vue_map *vue_map);
void brw_nir_lower_tes_inputs(nir_shader *nir, const struct intel_vue_map *vue);
void brw_nir_lower_fs_inputs(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct brw_wm_prog_key *key);
void brw_nir_lower_vue_outputs(nir_shader *nir);
void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct intel_vue_map *vue,
enum tess_primitive_mode tes_primitive_mode);
void brw_nir_lower_fs_outputs(nir_shader *nir);
bool brw_nir_lower_cmat(nir_shader *nir, unsigned subgroup_size);
bool brw_nir_lower_shading_rate_output(nir_shader *nir);
bool brw_nir_lower_sparse_intrinsics(nir_shader *nir);
struct brw_nir_lower_storage_image_opts {
const struct intel_device_info *devinfo;
bool lower_loads;
bool lower_stores;
bool lower_atomics;
bool lower_get_size;
};
bool brw_nir_lower_storage_image(nir_shader *nir,
const struct brw_nir_lower_storage_image_opts *opts);
bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,
const struct intel_device_info *devinfo);
void brw_postprocess_nir(nir_shader *nir,
const struct brw_compiler *compiler,
bool debug_enabled,
enum brw_robustness_flags robust_flags);
bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
const uint8_t *attrib_wa_flags);
bool brw_nir_apply_trig_workarounds(nir_shader *nir);
bool brw_nir_limit_trig_input_range_workaround(nir_shader *nir);
void brw_nir_apply_key(nir_shader *nir,
const struct brw_compiler *compiler,
const struct brw_base_prog_key *key,
unsigned max_subgroup_size);
unsigned brw_nir_api_subgroup_size(const nir_shader *nir,
unsigned hw_subgroup_size);
enum brw_conditional_mod brw_cmod_for_nir_comparison(nir_op op);
enum lsc_opcode lsc_aop_for_nir_intrinsic(const nir_intrinsic_instr *atomic);
enum brw_reg_type brw_type_for_nir_type(const struct intel_device_info *devinfo,
nir_alu_type type);
bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data);
void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
nir_shader *nir,
struct brw_ubo_range out_ranges[4]);
void brw_nir_optimize(nir_shader *nir, bool is_scalar,
const struct intel_device_info *devinfo);
nir_shader *brw_nir_create_passthrough_tcs(void *mem_ctx,
const struct brw_compiler *compiler,
const struct brw_tcs_prog_key *key);
#define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
#define BRW_NIR_FRAG_OUTPUT_INDEX_MASK INTEL_MASK(0, 0)
#define BRW_NIR_FRAG_OUTPUT_LOCATION_SHIFT 1
#define BRW_NIR_FRAG_OUTPUT_LOCATION_MASK INTEL_MASK(31, 1)
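/* Added note: these fields pack a fragment output's store into a single
 * driver_location value, with the output index (e.g. for dual-source
 * blending) in bit 0 and the FRAG_RESULT location in bits 31:1; see the
 * GET_FIELD() decode in brw_nir_lower_alpha_to_coverage().
 */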
bool brw_nir_move_interpolation_to_top(nir_shader *nir);
nir_def *brw_nir_load_global_const(nir_builder *b,
nir_intrinsic_instr *load_uniform,
nir_def *base_addr,
unsigned off);
const struct glsl_type *brw_nir_get_var_type(const struct nir_shader *nir,
nir_variable *var);
void brw_nir_adjust_payload(nir_shader *shader);
#ifdef __cplusplus
}
#endif
#endif /* BRW_NIR_H */

View file

@ -0,0 +1,258 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir.h"
/*
* This file implements an analysis pass that determines when we have to do
* a boolean resolve on Gen <= 5. Instructions that need a boolean resolve
* will have the booleans portion of the instr->pass_flags field set to
* BRW_NIR_BOOLEAN_NEEDS_RESOLVE.
*/
/** Returns the resolve status for the given source
*
* If the source has a parent instruction then the resolve status is the
* status of the parent instruction. If the source does not have a parent
* instruction then we don't know so we return NON_BOOLEAN.
*/
static uint8_t
get_resolve_status_for_src(nir_src *src)
{
nir_instr *src_instr = src->ssa->parent_instr;
uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
/* If the source instruction needs resolve, then from the perspective
* of the user, it's a true boolean.
*/
if (resolve_status == BRW_NIR_BOOLEAN_NEEDS_RESOLVE)
resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
return resolve_status;
}
/** Marks the given source as needing a resolve
*
* If the given source corresponds to an unresolved boolean it marks it as
* needing a resolve. Otherwise, we leave it alone.
*/
static bool
src_mark_needs_resolve(nir_src *src, void *void_state)
{
nir_instr *src_instr = src->ssa->parent_instr;
uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
/* If the source instruction is unresolved, then mark it as needing
* to be resolved.
*/
if (resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
src_instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
src_instr->pass_flags |= BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
}
return true;
}
static bool
analyze_boolean_resolves_block(nir_block *block)
{
nir_foreach_instr(instr, block) {
switch (instr->type) {
case nir_instr_type_alu: {
/* For ALU instructions, the resolve status is handled in a
* three-step process.
*
* 1) Look at the instruction type and sources and determine if it
* can be left unresolved.
*
* 2) Look at the destination and see if we have to resolve
* anyway. (This is the case if this instruction is not the
* only instruction writing to a given register.)
*
* 3) If the instruction has a resolve status other than
* BOOL_UNRESOLVED or BOOL_NEEDS_RESOLVE then we walk through
* the sources and ensure that they are also resolved. This
* ensures that we don't end up with any stray unresolved
* booleans going into ADDs or something like that.
*/
uint8_t resolve_status;
nir_alu_instr *alu = nir_instr_as_alu(instr);
switch (alu->op) {
case nir_op_b32all_fequal2:
case nir_op_b32all_iequal2:
case nir_op_b32all_fequal3:
case nir_op_b32all_iequal3:
case nir_op_b32all_fequal4:
case nir_op_b32all_iequal4:
case nir_op_b32any_fnequal2:
case nir_op_b32any_inequal2:
case nir_op_b32any_fnequal3:
case nir_op_b32any_inequal3:
case nir_op_b32any_fnequal4:
case nir_op_b32any_inequal4:
/* These are only implemented by the vec4 backend and its
* implementation emits resolved booleans. At some point in the
* future, this may change and we'll have to remove some of the
* above cases.
*/
resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
break;
case nir_op_mov:
case nir_op_inot:
/* This is a single-source instruction. Just copy the resolve
* status from the source.
*/
resolve_status = get_resolve_status_for_src(&alu->src[0].src);
break;
case nir_op_b32csel:
case nir_op_iand:
case nir_op_ior:
case nir_op_ixor: {
const unsigned first = alu->op == nir_op_b32csel ? 1 : 0;
uint8_t src0_status = get_resolve_status_for_src(&alu->src[first + 0].src);
uint8_t src1_status = get_resolve_status_for_src(&alu->src[first + 1].src);
/* src0 of a bcsel is evaluated as a Boolean with the expectation
* that it has already been resolved. Mark it as such.
*/
if (alu->op == nir_op_b32csel)
src_mark_needs_resolve(&alu->src[0].src, NULL);
if (src0_status == src1_status) {
resolve_status = src0_status;
} else if (src0_status == BRW_NIR_NON_BOOLEAN ||
src1_status == BRW_NIR_NON_BOOLEAN) {
/* If one of the sources is a non-boolean then the whole
* thing is a non-boolean.
*/
resolve_status = BRW_NIR_NON_BOOLEAN;
} else {
/* At this point one of them is a true boolean and one is a
* boolean that needs a resolve. We could either resolve the
* unresolved source or we could resolve here. If we resolve
* the unresolved source then we get two resolves for the price
* of one. Just set this one to BOOLEAN_NO_RESOLVE and we'll
* let the code below force a resolve on the unresolved source.
*/
resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
}
break;
}
default:
if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_bool) {
/* These instructions will turn into a CMP when we actually emit
* them, so the result will have to be resolved before it can be
* used.
*/
resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
/* Even though the destination is allowed to be left
* unresolved, the sources are treated as regular integers or
* floats so they need to be resolved.
*/
nir_foreach_src(instr, src_mark_needs_resolve, NULL);
} else {
resolve_status = BRW_NIR_NON_BOOLEAN;
}
}
/* Record the resolve status; unresolved booleans are allowed here. */
instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
resolve_status;
/* Finally, resolve sources if it's needed */
switch (resolve_status) {
case BRW_NIR_BOOLEAN_NEEDS_RESOLVE:
case BRW_NIR_BOOLEAN_UNRESOLVED:
/* This instruction is either unresolved or we're doing the
* resolve here; leave the sources alone.
*/
break;
case BRW_NIR_BOOLEAN_NO_RESOLVE:
case BRW_NIR_NON_BOOLEAN:
nir_foreach_src(instr, src_mark_needs_resolve, NULL);
break;
default:
unreachable("Invalid boolean flag");
}
break;
}
case nir_instr_type_load_const: {
nir_load_const_instr *load = nir_instr_as_load_const(instr);
/* For load_const instructions, it's a boolean exactly when it holds
* one of the values NIR_TRUE or NIR_FALSE.
*
* Since load_const instructions don't have any sources, we don't
* have to worry about resolving them.
*/
instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
if (load->value[0].u32 == NIR_TRUE || load->value[0].u32 == NIR_FALSE) {
instr->pass_flags |= BRW_NIR_BOOLEAN_NO_RESOLVE;
} else {
instr->pass_flags |= BRW_NIR_NON_BOOLEAN;
}
continue;
}
default:
/* Everything else is an unknown non-boolean value and needs to
* have all sources resolved.
*/
instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
BRW_NIR_NON_BOOLEAN;
nir_foreach_src(instr, src_mark_needs_resolve, NULL);
continue;
}
}
nir_if *following_if = nir_block_get_following_if(block);
if (following_if)
src_mark_needs_resolve(&following_if->condition, NULL);
return true;
}
static void
analyze_boolean_resolves_impl(nir_function_impl *impl)
{
nir_foreach_block(block, impl) {
analyze_boolean_resolves_block(block);
}
}
void
brw_nir_analyze_boolean_resolves(nir_shader *shader)
{
nir_foreach_function_impl(impl, shader) {
analyze_boolean_resolves_impl(impl);
}
}

View file

@ -0,0 +1,317 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir.h"
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"
/**
* \file brw_nir_analyze_ubo_ranges.c
*
* This pass decides which portions of UBOs to upload as push constants,
* so shaders can access them as part of the thread payload, rather than
* having to issue expensive memory reads to pull the data.
*
* The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
* buffers, in GRF (256-bit/32-byte) units.
*
* To do this, we examine NIR load_ubo intrinsics, recording the number of
* loads at each offset. We track offsets at a 32-byte granularity, so even
* fields with a bit of padding between them tend to fall into contiguous
* ranges. We build a list of these ranges, tracking their "cost" (number
* of registers required) and "benefit" (number of pull loads eliminated
* by pushing the range). We then sort the list to obtain the four best
* ranges (most benefit for the least cost).
*/
struct ubo_range_entry
{
struct brw_ubo_range range;
int benefit;
};
static int
score(const struct ubo_range_entry *entry)
{
return 2 * entry->benefit - entry->range.length;
}
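/* Added example (illustration only): a range covering 2 GRFs whose data is
 * pulled 5 times scores 2 * 5 - 2 = 8, while a 1 GRF range pulled twice
 * scores 2 * 2 - 1 = 3, so the sort below prefers the former.
 */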
/**
* Compares score for two UBO range entries.
*
* For a descending qsort().
*/
static int
cmp_ubo_range_entry(const void *va, const void *vb)
{
const struct ubo_range_entry *a = va;
const struct ubo_range_entry *b = vb;
/* Rank based on scores, descending order */
int delta = score(b) - score(a);
/* Then use the UBO block index as a tie-breaker, descending order */
if (delta == 0)
delta = b->range.block - a->range.block;
/* Finally use the start offset as a second tie-breaker, ascending order */
if (delta == 0)
delta = a->range.start - b->range.start;
return delta;
}
struct ubo_block_info
{
/* Each bit in the offsets bitfield represents a 32-byte section of data.
* If it's set to one, there is interesting UBO data at that offset. If
* not, there's a "hole" - padding between data - or just nothing at all.
*/
uint64_t offsets;
uint8_t uses[64];
};
struct ubo_analysis_state
{
struct hash_table *blocks;
bool uses_regular_uniforms;
};
static struct ubo_block_info *
get_block_info(struct ubo_analysis_state *state, int block)
{
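   /* Added note: the key is the block index biased by one so that block 0
    * does not become a NULL pointer key, which the hash table cannot store.
    */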
uint32_t hash = block + 1;
void *key = (void *) (uintptr_t) hash;
struct hash_entry *entry =
_mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
if (entry)
return (struct ubo_block_info *) entry->data;
struct ubo_block_info *info =
rzalloc(state->blocks, struct ubo_block_info);
_mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
return info;
}
static void
analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
{
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_uniform:
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_store:
case nir_intrinsic_image_deref_atomic:
case nir_intrinsic_image_deref_atomic_swap:
case nir_intrinsic_image_deref_size:
state->uses_regular_uniforms = true;
continue;
case nir_intrinsic_load_ubo:
break; /* Fall through to the analysis below */
default:
continue; /* Not a uniform or UBO intrinsic */
}
if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
nir_src_is_const(intrin->src[1])) {
const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]);
const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
const int offset = byte_offset / 32;
/* Avoid shifting by larger than the width of our bitfield, as this
* is undefined in C. Even if we require multiple bits to represent
* the entire value, it's OK to record a partial value - the backend
* is capable of falling back to pull loads for later components of
* vectors, as it has to shrink ranges for other reasons anyway.
*/
if (offset >= 64)
continue;
/* The value might span multiple 32-byte chunks. */
const int bytes = nir_intrinsic_dest_components(intrin) *
(intrin->def.bit_size / 8);
const int start = ROUND_DOWN_TO(byte_offset, 32);
const int end = ALIGN(byte_offset + bytes, 32);
const int chunks = (end - start) / 32;
/* TODO: should we count uses in loops as higher benefit? */
struct ubo_block_info *info = get_block_info(state, block);
info->offsets |= ((1ull << chunks) - 1) << offset;
info->uses[offset]++;
}
}
}
static void
print_ubo_entry(FILE *file,
const struct ubo_range_entry *entry,
struct ubo_analysis_state *state)
{
struct ubo_block_info *info = get_block_info(state, entry->range.block);
fprintf(file,
"block %2d, start %2d, length %2d, bits = %"PRIx64", "
"benefit %2d, cost %2d, score = %2d\n",
entry->range.block, entry->range.start, entry->range.length,
info->offsets, entry->benefit, entry->range.length, score(entry));
}
void
brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
nir_shader *nir,
struct brw_ubo_range out_ranges[4])
{
void *mem_ctx = ralloc_context(NULL);
struct ubo_analysis_state state = {
.uses_regular_uniforms = false,
.blocks =
_mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
};
/* Compute shaders use push constants to get the subgroup ID so it's
* best to just assume some system values are pushed.
*/
if (nir->info.stage == MESA_SHADER_COMPUTE)
state.uses_regular_uniforms = true;
/* Walk the IR, recording how many times each UBO block/offset is used. */
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
analyze_ubos_block(&state, block);
}
}
/* Find ranges: a block, starting 32-byte offset, and length. */
struct util_dynarray ranges;
util_dynarray_init(&ranges, mem_ctx);
hash_table_foreach(state.blocks, entry) {
const int b = entry->hash - 1;
const struct ubo_block_info *info = entry->data;
uint64_t offsets = info->offsets;
/* Walk through the offsets bitfield, finding contiguous regions of
* set bits:
*
* 0000000001111111111111000000000000111111111111110000000011111100
*          ^^^^^^^^^^^^^            ^^^^^^^^^^^^^^        ^^^^^^
*
* Each of these will become a UBO range.
*/
while (offsets != 0) {
/* Find the first 1 in the offsets bitfield. This represents the
* start of a range of interesting UBO data. Make it zero-indexed.
*/
int first_bit = ffsll(offsets) - 1;
/* Find the first 0 bit in offsets beyond first_bit. To find the
* first zero bit, we find the first 1 bit in the complement. In
* order to ignore bits before first_bit, we mask off those bits.
*/
int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
if (first_hole == -1) {
/* If we didn't find a hole, then set it to the end of the
* bitfield. There are no more ranges to process.
*/
first_hole = 64;
offsets = 0;
} else {
/* We've processed all bits before first_hole. Mask them off. */
offsets &= ~((1ull << first_hole) - 1);
}
struct ubo_range_entry *entry =
util_dynarray_grow(&ranges, struct ubo_range_entry, 1);
entry->range.block = b;
entry->range.start = first_bit;
/* first_hole is one beyond the end, so we don't need to add 1 */
entry->range.length = first_hole - first_bit;
entry->benefit = 0;
for (int i = 0; i < entry->range.length; i++)
entry->benefit += info->uses[first_bit + i];
}
}
int nr_entries = ranges.size / sizeof(struct ubo_range_entry);
if (0) {
util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
print_ubo_entry(stderr, entry, &state);
}
}
/* TODO: Consider combining ranges.
*
* We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS. If there are
* more ranges, and two are close by with only a small hole, it may be
* worth combining them. The holes will waste register space, but the
* benefit of removing pulls may outweigh that cost.
*/
/* Sort the list so the most beneficial ranges are at the front. */
if (nr_entries > 0) {
qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
cmp_ubo_range_entry);
}
struct ubo_range_entry *entries = ranges.data;
/* Return the top 4 or so. We drop by one if regular uniforms are in
* use, assuming one push buffer will be dedicated to those. We may
* also only get 3 on Haswell if we can't write INSTPM.
*
* The backend may need to shrink these ranges to ensure that they
* don't exceed the maximum push constant limits. It can simply drop
* the tail of the list, as that's the least valuable portion. We
* unfortunately can't truncate it here, because we don't know what
* the backend is planning to do with regular uniforms.
*/
const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) -
state.uses_regular_uniforms;
nr_entries = MIN2(nr_entries, max_ubos);
for (int i = 0; i < nr_entries; i++) {
out_ranges[i] = entries[i].range;
}
for (int i = nr_entries; i < 4; i++) {
out_ranges[i].block = 0;
out_ranges[i].start = 0;
out_ranges[i].length = 0;
}
ralloc_free(ranges.mem_ctx);
}

View file

@ -0,0 +1,132 @@
/*
* Copyright © 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "brw_nir.h"
/**
* Prior to Haswell, the hardware can't natively support GL_FIXED or
* 2_10_10_10_REV vertex formats. This pass inserts extra shader code
* to produce the correct values.
*/
static bool
apply_attr_wa_instr(nir_builder *b, nir_instr *instr, void *cb_data)
{
const uint8_t *attrib_wa_flags = cb_data;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_input)
return false;
uint8_t wa_flags = attrib_wa_flags[nir_intrinsic_base(intrin)];
if (wa_flags == 0)
return false;
b->cursor = nir_after_instr(instr);
nir_def *val = &intrin->def;
/* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
* come in as floating point conversions of the integer values.
*/
if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
nir_def *scaled =
nir_fmul_imm(b, val, 1.0f / 65536.0f);
nir_def *comps[4];
for (int i = 0; i < val->num_components; i++) {
bool rescale = i < (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK);
comps[i] = nir_channel(b, rescale ? scaled : val, i);
}
val = nir_vec(b, comps, val->num_components);
}
/* Do sign recovery for 2101010 formats if required. */
if (wa_flags & BRW_ATTRIB_WA_SIGN) {
/* sign recovery shift: <22, 22, 22, 30> */
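   /* Added note: with an arithmetic right shift, (v << 22) >> 22 replicates
    * bit 9 of each 10-bit XYZ component into bits 31:10, recovering its
    * sign; the 2-bit W component uses a shift of 30 for the same reason.
    */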
nir_def *shift = nir_imm_ivec4(b, 22, 22, 22, 30);
val = nir_ishr(b, nir_ishl(b, val, shift), shift);
}
/* Apply BGRA swizzle if required. */
if (wa_flags & BRW_ATTRIB_WA_BGRA) {
val = nir_swizzle(b, val, (unsigned[4]){2,1,0,3}, 4);
}
if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
/* ES 3.0 has different rules for converting signed normalized
* fixed-point numbers than desktop GL.
*/
if (wa_flags & BRW_ATTRIB_WA_SIGN) {
/* According to equation 2.2 of the ES 3.0 specification,
* signed normalization conversion is done by:
*
* f = c / (2^(b-1)-1)
*
* OpenGL 4.2+ uses this equation as well. Since most contexts
* promote to the new higher version, and this is what Haswell+
* hardware does anyway, we just always use this formula.
*/
nir_def *es3_normalize_factor =
nir_imm_vec4(b, 1.0f / ((1 << 9) - 1), 1.0f / ((1 << 9) - 1),
1.0f / ((1 << 9) - 1), 1.0f / ((1 << 1) - 1));
val = nir_fmax(b,
nir_fmul(b, nir_i2f32(b, val), es3_normalize_factor),
nir_imm_float(b, -1.0f));
} else {
/* The following equation is from the OpenGL 3.2 specification:
*
* 2.1 unsigned normalization
* f = c/(2^n-1)
*/
nir_def *normalize_factor =
nir_imm_vec4(b, 1.0f / ((1 << 10) - 1), 1.0f / ((1 << 10) - 1),
1.0f / ((1 << 10) - 1), 1.0f / ((1 << 2) - 1));
val = nir_fmul(b, nir_u2f32(b, val), normalize_factor);
}
}
if (wa_flags & BRW_ATTRIB_WA_SCALE) {
val = (wa_flags & BRW_ATTRIB_WA_SIGN) ? nir_i2f32(b, val)
: nir_u2f32(b, val);
}
nir_def_rewrite_uses_after(&intrin->def, val,
val->parent_instr);
return true;
}
bool
brw_nir_apply_attribute_workarounds(nir_shader *shader,
const uint8_t *attrib_wa_flags)
{
return nir_shader_instructions_pass(shader, apply_attr_wa_instr,
nir_metadata_block_index |
nir_metadata_dominance,
(void *)attrib_wa_flags);
}

View file

@ -0,0 +1,192 @@
/*
* Copyright © 2019 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "brw_nir.h"
/**
* We need to compute alpha to coverage dithering manually in shader
* and replace sample mask store with the bitwise-AND of sample mask and
* alpha to coverage dithering.
*
* The following formula is used to compute final sample mask:
* m = int(16.0 * clamp(src0_alpha, 0.0, 1.0))
* dither_mask = 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) |
* 0x0808 * (m & 2) | 0x0100 * (m & 1)
* sample_mask = sample_mask & dither_mask
*
* It gives a number of ones proportional to the alpha for 2, 4, 8 or 16
* least significant bits of the result:
* 0.0000 0000000000000000
* 0.0625 0000000100000000
* 0.1250 0001000000010000
* 0.1875 0001000100010000
* 0.2500 1000100010001000
* 0.3125 1000100110001000
* 0.3750 1001100010011000
* 0.4375 1001100110011000
* 0.5000 1010101010101010
* 0.5625 1010101110101010
* 0.6250 1011101010111010
* 0.6875 1011101110111010
* 0.7500 1110111011101110
* 0.8125 1110111111101110
* 0.8750 1111111011111110
* 0.9375 1111111111111110
* 1.0000 1111111111111111
*/
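/* Worked example (added commentary): for src0_alpha = 0.5 the formula gives
 * m = int(16.0 * 0.5) = 8, so (0xfea80 >> (8 & ~3)) & 0xf = 0xa and
 * dither_mask = 0x1111 * 0xa = 0xaaaa = 1010101010101010b, matching the
 * 0.5000 row above. A scalar C sketch of the formula (hypothetical helper,
 * illustration only):
 */
static inline unsigned
dither_mask_reference(float alpha)
{
   const float a = alpha < 0.0f ? 0.0f : (alpha > 1.0f ? 1.0f : alpha);
   const int m = (int)(16.0f * a);
   return 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) |
          0x0808 * (m & 2) |
          0x0100 * (m & 1);
}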
static nir_def *
build_dither_mask(nir_builder *b, nir_def *color)
{
assert(color->num_components == 4);
nir_def *alpha = nir_channel(b, color, 3);
nir_def *m =
nir_f2i32(b, nir_fmul_imm(b, nir_fsat(b, alpha), 16.0));
nir_def *part_a =
nir_iand_imm(b, nir_ushr(b, nir_imm_int(b, 0xfea80),
nir_iand_imm(b, m, ~3)),
0xf);
nir_def *part_b = nir_iand_imm(b, m, 2);
nir_def *part_c = nir_iand_imm(b, m, 1);
return nir_ior(b, nir_imul_imm(b, part_a, 0x1111),
nir_ior(b, nir_imul_imm(b, part_b, 0x0808),
nir_imul_imm(b, part_c, 0x0100)));
}
bool
brw_nir_lower_alpha_to_coverage(nir_shader *shader,
const struct brw_wm_prog_key *key,
const struct brw_wm_prog_data *prog_data)
{
assert(shader->info.stage == MESA_SHADER_FRAGMENT);
assert(key->alpha_to_coverage != BRW_NEVER);
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
const uint64_t outputs_written = shader->info.outputs_written;
if (!(outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) ||
!(outputs_written & (BITFIELD64_BIT(FRAG_RESULT_COLOR) |
BITFIELD64_BIT(FRAG_RESULT_DATA0))))
goto skip;
nir_intrinsic_instr *sample_mask_write = NULL;
nir_intrinsic_instr *color0_write = NULL;
bool sample_mask_write_first = false;
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_store_output)
continue;
/* We call nir_lower_io_to_temporaries to lower FS outputs to
* temporaries with a copy at the end so this should be the last
* block in the shader.
*/
assert(block->cf_node.parent == &impl->cf_node);
assert(nir_cf_node_is_last(&block->cf_node));
/* See store_output in fs_visitor::nir_emit_fs_intrinsic */
const unsigned store_offset = nir_src_as_uint(intrin->src[1]);
const unsigned driver_location = nir_intrinsic_base(intrin) +
SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
/* Extract the FRAG_RESULT */
const unsigned location =
GET_FIELD(driver_location, BRW_NIR_FRAG_OUTPUT_LOCATION);
if (location == FRAG_RESULT_SAMPLE_MASK) {
assert(sample_mask_write == NULL);
sample_mask_write = intrin;
sample_mask_write_first = (color0_write == NULL);
}
if (location == FRAG_RESULT_COLOR ||
location == FRAG_RESULT_DATA0) {
assert(color0_write == NULL);
color0_write = intrin;
}
}
}
/* It's possible that shader_info may be out-of-date and the writes to
* either gl_SampleMask or the first color value may have been removed.
* This can happen if, for instance, a nir_undef is written to the
* color value. In that case, just bail and don't do anything rather
* than crashing.
*/
if (color0_write == NULL || sample_mask_write == NULL)
goto skip;
/* It's possible that the color value isn't actually a vec4. In this case,
* assuming an alpha of 1.0 and letting the sample mask pass through
* unaltered seems like the kindest thing to do to apps.
*/
nir_def *color0 = color0_write->src[0].ssa;
if (color0->num_components < 4)
goto skip;
nir_def *sample_mask = sample_mask_write->src[0].ssa;
if (sample_mask_write_first) {
/* If the sample mask write comes before the write to color0, we need
* to move it because it's going to use the value from color0 to
* compute the sample mask.
*/
nir_instr_remove(&sample_mask_write->instr);
nir_instr_insert(nir_after_instr(&color0_write->instr),
&sample_mask_write->instr);
}
nir_builder b = nir_builder_at(nir_before_instr(&sample_mask_write->instr));
/* Combine dither_mask and the gl_SampleMask value */
nir_def *dither_mask = build_dither_mask(&b, color0);
dither_mask = nir_iand(&b, sample_mask, dither_mask);
if (key->alpha_to_coverage == BRW_SOMETIMES) {
nir_def *push_flags =
nir_load_uniform(&b, 1, 32, nir_imm_int(&b, prog_data->msaa_flags_param * 4));
nir_def *alpha_to_coverage =
nir_test_mask(&b, push_flags, INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE);
dither_mask = nir_bcsel(&b, alpha_to_coverage,
dither_mask, sample_mask_write->src[0].ssa);
}
nir_src_rewrite(&sample_mask_write->src[0], dither_mask);
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
return true;
skip:
nir_metadata_preserve(impl, nir_metadata_all);
return false;
}

View file

@ -0,0 +1,818 @@
/*
* Copyright 2023 Intel Corporation
* SPDX-License-Identifier: MIT
*/
/**
* \file brw_nir_lower_cooperative_matrix.c
* Lower cooperative matrix to subgroup operations.
*
* All supported matrix types are assumed to have either 8 rows or 8
* columns. The other dimension of the matrix is typically 8 times the number
* of data elements that can be stored in a 32-bit dword. Matrix data is
* indexed by a combination of an array element and a subgroup invocation ID.
*
* Two layouts for matrix data are used. In the first layout,
* subgroupShuffle(slice[N], ...) accesses row N of the matrix. This will be
* called row-major hereafter. In the other layout,
* subgroupShuffle(slice[...], M) accesses column M of the matrix. This will
* be called column-major hereafter. In cases where a single 32-bit value is
* stored in each entry, these layouts are identical.
*
* The subtle difference arises when multiple values are packed into a single
* 32-bit dword. If two 16-bit values are packed in a single 32-bit value in
* column-major, subgroupShuffle(slice[0], 1) holds matrix entries m[1][1] and
* m[2][1] (in m[row][column] notation). In row-major, that same shuffle holds
* m[0][2] and m[0][3].
*
* There is an alternate way to think about the matrix layouts. Every matrix
* size supported by the Intel driver is either Sx8 (e.g., 16x8 for float16 B
* matrix) or Sx8T (e.g., 8x32 for int8 A matrix). The A matrix and B matrix
* layouts are such that a single 8 dword register hold an entire row of the
* matrix.
*
* Consider a matrix stored starting in register g32. In an A matrix, the
* packed dwords of g32 contain only the data for a single row of the
* matrix. g32 is row 0, g33 is row 1, etc. In a B matrix, the packed dwords
* of g(32+N).X contain only the data for a single column of the
* matrix. g[32:40].0 is column 0, g[32:40].1 is column 1, etc.
*
* This leads to some shenanigans in \c lower_cmat_load_store.
*
* In the common case, A, C, and result matrices are stored row major while B
* matrices are stored column major. This arrangement facilitates efficient
* dot product operations using DPAS or DP4A instructions.
*
* Future optimizations are possible when row and column major are
* flipped. That is, efficient dot products are also possible when A, C, and
* result matrices are column major while B is row major.
*/
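/* Worked sizing example (added commentary, derived from
 * get_slice_type_from_desc() below): a 16x8 float16 B matrix at subgroup
 * size 16 holds 128 elements, so elements_per_invocation = 128 / 16 = 8,
 * packing_factor = MIN2(8, 32 / 16) = 2, and the slice type becomes a
 * 4-component uint vector (len = 8 / 2 = 4) with two 16-bit values packed
 * per 32-bit component.
 */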
#include "brw_nir.h"
struct lower_cmat_state {
nir_shader *shader;
struct hash_table *slice_coop_types;
struct hash_table *vars_to_slice;
unsigned subgroup_size;
};
static void
print_coop_types(struct lower_cmat_state *state)
{
fprintf(stderr, "--- Slices to Cooperative Matrix type table\n");
hash_table_foreach(state->slice_coop_types, e) {
nir_variable *var = (void *)e->key;
const struct glsl_type *t = e->data;
fprintf(stderr, "%p: %s -> %s\n", var, var->name, glsl_get_type_name(t));
}
fprintf(stderr, "\n\n");
}
static const struct glsl_type *
get_coop_type_for_slice(struct lower_cmat_state *state, nir_deref_instr *deref)
{
nir_variable *var = nir_deref_instr_get_variable(deref);
struct hash_entry *entry = _mesa_hash_table_search(state->slice_coop_types, var);
assert(entry != NULL);
return entry->data;
}
static bool
lower_cmat_filter(const nir_instr *instr, const void *_state)
{
if (instr->type == nir_instr_type_deref) {
nir_deref_instr *deref = nir_instr_as_deref(instr);
return glsl_type_is_cmat(deref->type);
}
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_cmat_construct:
case nir_intrinsic_cmat_load:
case nir_intrinsic_cmat_store:
case nir_intrinsic_cmat_length:
case nir_intrinsic_cmat_muladd:
case nir_intrinsic_cmat_unary_op:
case nir_intrinsic_cmat_binary_op:
case nir_intrinsic_cmat_scalar_op:
case nir_intrinsic_cmat_bitcast:
case nir_intrinsic_cmat_insert:
case nir_intrinsic_cmat_extract:
case nir_intrinsic_cmat_copy:
return true;
default:
return false;
}
}
/**
* Get number of matrix elements packed in each component of the slice.
*/
static unsigned
get_packing_factor(const struct glsl_cmat_description desc,
const struct glsl_type *slice_type)
{
const struct glsl_type *slice_element_type = glsl_without_array(slice_type);
assert(!glsl_type_is_cmat(slice_type));
assert(glsl_get_bit_size(slice_element_type) >= glsl_base_type_get_bit_size(desc.element_type));
assert(glsl_get_bit_size(slice_element_type) % glsl_base_type_get_bit_size(desc.element_type) == 0);
return glsl_get_bit_size(slice_element_type) / glsl_base_type_get_bit_size(desc.element_type);
}
static const struct glsl_type *
get_slice_type_from_desc(const struct lower_cmat_state *state,
const struct glsl_cmat_description desc)
{
enum glsl_base_type base_type;
/* Number of matrix elements stored by each subgroup invocation. If the
* data is packed, the slice size will be less than this.
*/
const unsigned elements_per_invocation =
(desc.rows * desc.cols) / state->subgroup_size;
assert(elements_per_invocation > 0);
const unsigned element_bits = 32;
const unsigned bits = glsl_base_type_get_bit_size(desc.element_type);
unsigned packing_factor = MIN2(elements_per_invocation,
element_bits / bits);
/* Adjust the packing factor so that each row of the matrix fills an
* entire GRF.
*
* The in-register layout of B matrices is different, so those are handled
* more like column major (for row major matrices). See the file comment
* for more details.
*/
const unsigned actual_cols = desc.use != GLSL_CMAT_USE_B ? desc.cols : desc.rows;
while ((actual_cols / packing_factor) < 8) {
assert(packing_factor > 1);
packing_factor /= 2;
}
switch (desc.element_type) {
case GLSL_TYPE_FLOAT:
base_type = GLSL_TYPE_FLOAT;
break;
case GLSL_TYPE_UINT:
case GLSL_TYPE_FLOAT16:
case GLSL_TYPE_UINT8:
case GLSL_TYPE_UINT16:
base_type = glsl_get_base_type(glsl_uintN_t_type(packing_factor * bits));
break;
case GLSL_TYPE_INT:
case GLSL_TYPE_INT8:
case GLSL_TYPE_INT16:
base_type = glsl_get_base_type(glsl_intN_t_type(packing_factor * bits));
break;
default:
unreachable("Invalid cooperative matrix element type.");
}
unsigned len = elements_per_invocation / packing_factor;
/* Supported matrix sizes are designed to fill either 4 or 8 SIMD8
* registers. That means:
*
*              4 registers   8 registers
*   SIMD32     len = 1       len = 2
*   SIMD16     len = 2       len = 4
*   SIMD8      len = 4       len = 8
*
* If configurations are added that result in other values of len, at the
* very least this assertion will need to be updated. The only value of len
* that makes sense to add would be 16, and that would be a lot of
* registers.
*/
assert(len == 1 || len == 2 || len == 4 || len == 8);
const struct glsl_type *slice_type = glsl_vector_type(base_type, len);
assert(packing_factor == get_packing_factor(desc, slice_type));
return slice_type;
}
static const struct glsl_type *
get_slice_type(const struct lower_cmat_state *state,
const struct glsl_type *type)
{
if (glsl_type_is_array(type)) {
const struct glsl_type *slice_type =
get_slice_type(state, glsl_get_array_element(type));
return glsl_array_type(slice_type, glsl_array_size(type), 0);
}
assert(glsl_type_is_cmat(type));
return get_slice_type_from_desc(state,
*glsl_get_cmat_description(type));
}
static nir_deref_instr *
create_local_slice(struct lower_cmat_state *state, nir_builder *b,
const struct glsl_type *mat_type, const char *name)
{
const struct glsl_type *slice_type = get_slice_type(state, mat_type);
nir_variable *slice_var = nir_local_variable_create(b->impl, slice_type, name);
_mesa_hash_table_insert(state->slice_coop_types, slice_var, (void *)mat_type);
return nir_build_deref_var(b, slice_var);
}
static void
lower_cmat_load_store(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_cmat_state *state)
{
const bool load = intrin->intrinsic == nir_intrinsic_cmat_load;
const unsigned mat_src = load ? 0 : 1;
const unsigned ptr_src = load ? 1 : 0;
nir_deref_instr *slice = nir_src_as_deref(intrin->src[mat_src]);
const struct glsl_type *mat_type = get_coop_type_for_slice(state, slice);
const struct glsl_cmat_description *desc = glsl_get_cmat_description(mat_type);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const unsigned num_components = glsl_get_vector_elements(slice->type);
const unsigned packing_factor = get_packing_factor(*desc, slice->type);
nir_deref_instr *pointer = nir_src_as_deref(intrin->src[ptr_src]);
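   /* Added note: per the file comment, A/C/result slices are row major and
    * B slices are column major. The branch below takes the fast path when
    * the requested memory layout matches the slice layout, copying whole
    * packed dwords; otherwise each element is accessed and re-packed
    * individually.
    */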
if ((nir_intrinsic_matrix_layout(intrin) == GLSL_MATRIX_LAYOUT_ROW_MAJOR) ==
(desc->use != GLSL_CMAT_USE_B)) {
nir_def *stride = nir_udiv_imm(b, intrin->src[2].ssa, packing_factor);
const struct glsl_type *element_type =
glsl_scalar_type(glsl_get_base_type(slice->type));
pointer = nir_build_deref_cast(b, &pointer->def, pointer->modes,
element_type,
glsl_get_bit_size(element_type) / 8);
nir_def *invocation = nir_load_subgroup_invocation(b);
nir_def *base_offset;
nir_def *step;
if (desc->use != GLSL_CMAT_USE_B) {
base_offset = nir_iadd(b,
nir_imul(b,
nir_udiv_imm(b, invocation, 8),
stride),
nir_umod_imm(b, invocation, 8));
step = nir_imul_imm(b, stride, state->subgroup_size / 8);
} else {
base_offset = nir_iadd(b,
nir_imul(b,
nir_umod_imm(b, invocation, 8),
stride),
nir_udiv_imm(b, invocation, 8));
step = nir_imm_int(b, state->subgroup_size / 8);
}
for (unsigned i = 0; i < num_components; i++) {
nir_def *offset = nir_imul_imm(b, step, i);
nir_deref_instr *memory_deref =
nir_build_deref_ptr_as_array(b, pointer,
nir_i2iN(b,
nir_iadd(b,
base_offset,
offset),
pointer->def.bit_size));
if (load) {
results[i] = nir_load_deref(b, memory_deref);
} else {
nir_def *src = nir_channel(b, nir_load_deref(b, slice), i);
nir_store_deref(b, memory_deref, src, 0x1);
}
}
} else {
nir_def *stride = intrin->src[2].ssa;
const struct glsl_type *element_type = glsl_scalar_type(desc->element_type);
const unsigned element_bits = glsl_base_type_get_bit_size(desc->element_type);
const unsigned element_stride = element_bits / 8;
pointer = nir_build_deref_cast(b, &pointer->def, pointer->modes, element_type,
element_stride);
nir_def *invocation_div_8 = nir_udiv_imm(b, nir_load_subgroup_invocation(b), 8);
nir_def *invocation_mod_8 = nir_umod_imm(b, nir_load_subgroup_invocation(b), 8);
nir_def *packed_stride = nir_imul_imm(b, stride, packing_factor);
for (unsigned i = 0; i < num_components; i++) {
const unsigned i_offset = i * (state->subgroup_size / 8);
nir_def *v[4];
for (unsigned j = 0; j < packing_factor; j++) {
nir_def *j_offset = nir_imul_imm(b, stride, j);
nir_def *offset;
if (desc->use != GLSL_CMAT_USE_B) {
offset = nir_iadd(b,
nir_iadd(b,
nir_imul(b,
invocation_mod_8,
packed_stride),
invocation_div_8),
nir_iadd_imm(b, j_offset, i_offset));
} else {
offset = nir_iadd(b,
nir_iadd(b,
nir_imul(b,
invocation_div_8,
packed_stride),
invocation_mod_8),
nir_iadd(b,
nir_imul_imm(b,
packed_stride,
i_offset),
j_offset));
}
nir_deref_instr *memory_deref =
nir_build_deref_ptr_as_array(b, pointer,
nir_i2iN(b,
offset,
pointer->def.bit_size));
if (load) {
v[j] = nir_load_deref(b, memory_deref);
} else {
nir_def *src = nir_channel(b, nir_load_deref(b, slice), i);
nir_def *v =
nir_channel(b, nir_unpack_bits(b, src, element_bits), j);
nir_store_deref(b, memory_deref, v, 0x1);
}
}
if (load) {
results[i] = nir_pack_bits(b, nir_vec(b, v, packing_factor),
packing_factor * element_bits);
}
}
}
if (load)
nir_store_deref(b, slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
}
static void
lower_cmat_unary_op(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_cmat_state *state)
{
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[1]);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
const struct glsl_type *dst_mat_type =
get_coop_type_for_slice(state, dst_slice);
const struct glsl_type *src_mat_type =
get_coop_type_for_slice(state, src_slice);
const struct glsl_cmat_description dst_desc =
*glsl_get_cmat_description(dst_mat_type);
const struct glsl_cmat_description src_desc =
*glsl_get_cmat_description(src_mat_type);
const unsigned dst_bits = glsl_base_type_bit_size(dst_desc.element_type);
const unsigned src_bits = glsl_base_type_bit_size(src_desc.element_type);
/* The type of the returned slice may be different from the type of the
* input slice.
*/
const unsigned dst_packing_factor =
get_packing_factor(dst_desc, dst_slice->type);
const unsigned src_packing_factor =
get_packing_factor(src_desc, src_slice->type);
const nir_op op = nir_intrinsic_alu_op(intrin);
/* There are three possible cases:
*
* 1. dst_packing_factor == src_packing_factor. This is the common case,
* and handling it is straightforward.
*
* 2. dst_packing_factor > src_packing_factor. This occurs when converting a
* float32_t matrix slice to a packed float16_t slice. Loop over the size
* of the destination slice, but read multiple entries from the source
* slice on each iteration.
*
* 3. dst_packing_factor < src_packing_factor. This occurs when converting a
* packed int8_t matrix slice to an int32_t slice. Loop over the size of
* the source slice, but write multiple entries to the destination slice
* on each iteration.
*
* Handle all cases by iterating over the total (non-packed) number of
* elements in the slice. When dst_packing_factor values have been
* calculated, store them.
*/
assert((dst_packing_factor * glsl_get_vector_elements(dst_slice->type)) ==
(src_packing_factor * glsl_get_vector_elements(src_slice->type)));
/* Stores at most dst_packing_factor partial results. */
nir_def *v[4];
assert(dst_packing_factor <= 4);
for (unsigned i = 0; i < num_components * dst_packing_factor; i++) {
const unsigned dst_chan_index = i % dst_packing_factor;
const unsigned src_chan_index = i % src_packing_factor;
const unsigned dst_index = i / dst_packing_factor;
const unsigned src_index = i / src_packing_factor;
nir_def *src =
nir_channel(b,
nir_unpack_bits(b,
nir_channel(b,
nir_load_deref(b, src_slice),
src_index),
src_bits),
src_chan_index);
v[dst_chan_index] = nir_build_alu1(b, op, src);
if (dst_chan_index == (dst_packing_factor - 1)) {
results[dst_index] =
nir_pack_bits(b, nir_vec(b, v, dst_packing_factor),
dst_packing_factor * dst_bits);
}
}
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
}
static void
lower_cmat_binary_op(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_cmat_state *state)
{
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *src_a_slice = nir_src_as_deref(intrin->src[1]);
nir_deref_instr *src_b_slice = nir_src_as_deref(intrin->src[2]);
nir_def *src_a = nir_load_deref(b, src_a_slice);
nir_def *src_b = nir_load_deref(b, src_b_slice);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
ASSERTED const struct glsl_type *src_a_mat_type = get_coop_type_for_slice(state, src_a_slice);
ASSERTED const struct glsl_type *src_b_mat_type = get_coop_type_for_slice(state, src_b_slice);
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(dst_mat_type);
assert(dst_mat_type == src_a_mat_type);
assert(dst_mat_type == src_b_mat_type);
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
const unsigned packing_factor = get_packing_factor(desc, dst_slice->type);
for (unsigned i = 0; i < num_components; i++) {
nir_def *val_a = nir_channel(b, src_a, i);
nir_def *val_b = nir_channel(b, src_b, i);
results[i] =
nir_pack_bits(b, nir_build_alu2(b, nir_intrinsic_alu_op(intrin),
nir_unpack_bits(b, val_a, bits),
nir_unpack_bits(b, val_b, bits)),
packing_factor * bits);
}
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
}
static void
lower_cmat_scalar_op(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_cmat_state *state)
{
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[1]);
nir_def *scalar = intrin->src[2].ssa;
nir_def *src = nir_load_deref(b, src_slice);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
ASSERTED const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
ASSERTED const struct glsl_type *src_mat_type = get_coop_type_for_slice(state, src_slice);
assert(dst_mat_type == src_mat_type);
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(dst_mat_type);
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
const unsigned packing_factor = get_packing_factor(desc, dst_slice->type);
for (unsigned i = 0; i < num_components; i++) {
nir_def *val = nir_channel(b, src, i);
results[i] =
nir_pack_bits(b, nir_build_alu2(b, nir_intrinsic_alu_op(intrin),
nir_unpack_bits(b, val, bits),
scalar),
packing_factor * bits);
}
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
}
static nir_deref_instr *
lower_cmat_deref(nir_builder *b, nir_deref_instr *deref,
struct lower_cmat_state *state)
{
nir_deref_instr *parent = nir_deref_instr_parent(deref);
if (parent) {
assert(deref->deref_type == nir_deref_type_array);
parent = lower_cmat_deref(b, parent, state);
return nir_build_deref_array(b, parent, deref->arr.index.ssa);
} else {
assert(deref->deref_type == nir_deref_type_var);
assert(deref->var);
assert(glsl_type_is_cmat(glsl_without_array(deref->var->type)));
struct hash_entry *entry = _mesa_hash_table_search(state->vars_to_slice, deref->var);
assert(entry);
return nir_build_deref_var(b, (nir_variable *)entry->data);
}
}
static nir_def *
lower_cmat_instr(nir_builder *b, nir_instr *instr, void *_state)
{
struct lower_cmat_state *state = _state;
if (instr->type == nir_instr_type_deref) {
nir_deref_instr *deref = lower_cmat_deref(b, nir_instr_as_deref(instr), state);
return &deref->def;
}
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_cmat_load:
case nir_intrinsic_cmat_store:
lower_cmat_load_store(b, intrin, state);
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_construct: {
nir_deref_instr *slice = nir_src_as_deref(intrin->src[0]);
nir_def *src = intrin->src[1].ssa;
const struct glsl_type *mat_type = get_coop_type_for_slice(state, slice);
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(mat_type);
const unsigned packing_factor = get_packing_factor(desc, slice->type);
if (packing_factor > 1) {
src = nir_pack_bits(b, nir_replicate(b, src, packing_factor),
packing_factor * glsl_base_type_get_bit_size(desc.element_type));
}
const unsigned num_components = glsl_get_vector_elements(slice->type);
nir_store_deref(b, slice, nir_replicate(b, src, num_components),
nir_component_mask(num_components));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
case nir_intrinsic_cmat_unary_op:
lower_cmat_unary_op(b, intrin, state);
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_binary_op:
lower_cmat_binary_op(b, intrin, state);
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_scalar_op:
lower_cmat_scalar_op(b, intrin, state);
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_length: {
const struct glsl_cmat_description desc = nir_intrinsic_cmat_desc(intrin);
const struct glsl_type *mat_type = glsl_cmat_type(&desc);
const struct glsl_type *slice_type = get_slice_type(state, mat_type);
return nir_imm_intN_t(b, (get_packing_factor(desc, slice_type) *
glsl_get_vector_elements(slice_type)), 32);
}
case nir_intrinsic_cmat_muladd: {
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *A_slice = nir_src_as_deref(intrin->src[1]);
nir_deref_instr *B_slice = nir_src_as_deref(intrin->src[2]);
nir_deref_instr *accum_slice = nir_src_as_deref(intrin->src[3]);
const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
const struct glsl_cmat_description dst_desc = *glsl_get_cmat_description(dst_mat_type);
const struct glsl_type *src_mat_type = get_coop_type_for_slice(state, A_slice);
const struct glsl_cmat_description src_desc = *glsl_get_cmat_description(src_mat_type);
const unsigned packing_factor = get_packing_factor(dst_desc, dst_slice->type);
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
nir_def *result =
nir_dpas_intel(b,
packing_factor * glsl_base_type_get_bit_size(dst_desc.element_type),
nir_load_deref(b, A_slice),
nir_load_deref(b, B_slice),
nir_load_deref(b, accum_slice),
.dest_type = nir_get_nir_type_for_glsl_base_type(dst_desc.element_type),
.src_type = nir_get_nir_type_for_glsl_base_type(src_desc.element_type),
.saturate = nir_intrinsic_saturate(intrin),
.cmat_signed_mask = nir_intrinsic_cmat_signed_mask(intrin),
.systolic_depth = 8,
.repeat_count = 8);
nir_store_deref(b, dst_slice, result,
nir_component_mask(num_components));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
case nir_intrinsic_cmat_bitcast: {
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[1]);
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
assert(glsl_get_vector_elements(src_slice->type) == num_components);
nir_store_deref(b, dst_slice, nir_load_deref(b, src_slice),
nir_component_mask(num_components));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
case nir_intrinsic_cmat_copy:
nir_copy_deref(b,
nir_src_as_deref(intrin->src[0]),
nir_src_as_deref(intrin->src[1]));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
case nir_intrinsic_cmat_insert: {
nir_deref_instr *dst_slice = nir_src_as_deref(intrin->src[0]);
nir_def *scalar = intrin->src[1].ssa;
nir_deref_instr *src_slice = nir_src_as_deref(intrin->src[2]);
const nir_src dst_index = intrin->src[3];
const struct glsl_type *dst_mat_type = get_coop_type_for_slice(state, dst_slice);
ASSERTED const struct glsl_type *src_mat_type = get_coop_type_for_slice(state, src_slice);
assert(dst_mat_type == src_mat_type);
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(dst_mat_type);
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
const unsigned packing_factor = get_packing_factor(desc, dst_slice->type);
const unsigned num_components = glsl_get_vector_elements(dst_slice->type);
nir_def *slice_index = nir_udiv_imm(b, dst_index.ssa, packing_factor);
nir_def *vector_index = nir_umod_imm(b, dst_index.ssa, packing_factor);
nir_def *results[NIR_MAX_VEC_COMPONENTS];
const int slice_constant_index = nir_src_is_const(dst_index)
? nir_src_as_uint(dst_index) / packing_factor
: -1;
for (unsigned i = 0; i < num_components; i++) {
nir_def *val = nir_channel(b, nir_load_deref(b, src_slice), i);
nir_def *insert;
if (slice_constant_index < 0 || slice_constant_index == i) {
if (packing_factor == 1) {
insert = scalar;
} else {
nir_def *unpacked = nir_unpack_bits(b, val, bits);
nir_def *v = nir_vector_insert(b, unpacked, scalar, vector_index);
insert = nir_pack_bits(b, v, bits * packing_factor);
}
} else {
insert = val;
}
results[i] = slice_constant_index < 0
? nir_bcsel(b, nir_ieq_imm(b, slice_index, i), insert, val)
: insert;
}
nir_store_deref(b, dst_slice, nir_vec(b, results, num_components),
nir_component_mask(num_components));
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}
case nir_intrinsic_cmat_extract: {
nir_deref_instr *slice = nir_src_as_deref(intrin->src[0]);
const struct glsl_type *mat_type = get_coop_type_for_slice(state, slice);
nir_def *index = intrin->src[1].ssa;
const struct glsl_cmat_description desc =
*glsl_get_cmat_description(mat_type);
const unsigned bits = glsl_base_type_bit_size(desc.element_type);
const unsigned packing_factor = get_packing_factor(desc, slice->type);
nir_def *src =
nir_vector_extract(b, nir_load_deref(b, slice),
nir_udiv_imm(b, index, packing_factor));
if (packing_factor == 1) {
return src;
} else {
return nir_vector_extract(b,
nir_unpack_bits(b, src, bits),
nir_umod_imm(b, index, packing_factor));
}
}
default:
unreachable("invalid cooperative matrix intrinsic");
}
}
static void
create_slice_var(struct lower_cmat_state *state, nir_variable *var,
nir_function_impl *impl)
{
// TODO: without array
const struct glsl_type *mat_type = glsl_without_array(var->type);
assert(glsl_type_is_cmat(mat_type));
assert((!impl && var->data.mode == nir_var_shader_temp) ||
( impl && var->data.mode == nir_var_function_temp));
const struct glsl_type *slice_type = get_slice_type(state, var->type);
const char *slice_name = ralloc_asprintf(state->shader, "%s_slice", var->name);
nir_variable *slice_var = impl ?
nir_local_variable_create(impl, slice_type, slice_name) :
nir_variable_create(state->shader, var->data.mode, slice_type, slice_name);
_mesa_hash_table_insert(state->vars_to_slice, var, slice_var);
_mesa_hash_table_insert(state->slice_coop_types, slice_var, (void *)mat_type);
}
bool
brw_nir_lower_cmat(nir_shader *shader, unsigned subgroup_size)
{
void *temp_ctx = ralloc_context(NULL);
struct lower_cmat_state state = {
.shader = shader,
.slice_coop_types = _mesa_pointer_hash_table_create(temp_ctx),
.vars_to_slice = _mesa_pointer_hash_table_create(temp_ctx),
.subgroup_size = subgroup_size,
};
/* Create a slice array for each variable and add a map from the original
* variable back to it, so it can be reached during lowering.
*
* TODO: Cooperative matrix inside struct?
*/
nir_foreach_variable_in_shader(var, shader) {
if (glsl_type_is_cmat(glsl_without_array(var->type)))
create_slice_var(&state, var, NULL);
}
nir_foreach_function(func, shader) {
nir_foreach_function_temp_variable(var, func->impl) {
if (glsl_type_is_cmat(glsl_without_array(var->type)))
create_slice_var(&state, var, func->impl);
}
}
bool progress = nir_shader_lower_instructions(shader,
lower_cmat_filter,
lower_cmat_instr,
&state);
ralloc_free(temp_ctx);
return progress;
}

View file

@@ -0,0 +1,362 @@
/*
* Copyright (c) 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
struct lower_intrinsics_state {
nir_shader *nir;
nir_function_impl *impl;
bool progress;
bool hw_generated_local_id;
nir_builder builder;
};
static void
compute_local_index_id(nir_builder *b,
nir_shader *nir,
nir_def **local_index,
nir_def **local_id)
{
nir_def *subgroup_id = nir_load_subgroup_id(b);
nir_def *thread_local_id =
nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
nir_def *channel = nir_load_subgroup_invocation(b);
nir_def *linear = nir_iadd(b, channel, thread_local_id);
nir_def *size_x;
nir_def *size_y;
if (nir->info.workgroup_size_variable) {
nir_def *size_xyz = nir_load_workgroup_size(b);
size_x = nir_channel(b, size_xyz, 0);
size_y = nir_channel(b, size_xyz, 1);
} else {
size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
}
nir_def *size_xy = nir_imul(b, size_x, size_y);
/* The local invocation index and ID must respect the following
*
* gl_LocalInvocationID.x =
* gl_LocalInvocationIndex % gl_WorkGroupSize.x;
* gl_LocalInvocationID.y =
* (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
* gl_WorkGroupSize.y;
* gl_LocalInvocationID.z =
* (gl_LocalInvocationIndex /
* (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
* gl_WorkGroupSize.z;
*
* However, the final % gl_WorkGroupSize.z does nothing unless we
* accidentally end up with a gl_LocalInvocationIndex that is too
* large, so it can safely be omitted.
*/
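/* Worked example (hypothetical sizes): for an 8x4x2 workgroup and
 * gl_LocalInvocationIndex == 37, the formulas above give
 * ID.x = 37 % 8 = 5, ID.y = (37 / 8) % 4 = 0, ID.z = 37 / 32 = 1,
 * and indeed 5 + 0*8 + 1*32 == 37.
 */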
nir_def *id_x, *id_y, *id_z;
switch (nir->info.cs.derivative_group) {
case DERIVATIVE_GROUP_NONE:
if (nir->info.num_images == 0 &&
nir->info.num_textures == 0) {
/* X-major lid order. Optimal for linear accesses only,
* which are usually buffers. X,Y ordering will look like:
* (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
*/
id_x = nir_umod(b, linear, size_x);
id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
*local_index = linear;
} else if (!nir->info.workgroup_size_variable &&
nir->info.workgroup_size[1] % 4 == 0) {
/* 1x4 block X-major lid order. Same as X-major except increments in
* blocks of width=1 height=4. Always optimal for tileY and usually
* optimal for linear accesses.
* x = (linear / 4) % size_x
* y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
* X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
* (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
*/
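/* Checking the formulas on a hypothetical size_x = 8: linear = 9 gives
 * block = 2, x = 2 % 8 = 2, y = (9 % 4) + (2 / 8) * 4 = 1, i.e. (2,1),
 * which is the 10th entry of the ordering listed above.
 */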
const unsigned height = 4;
nir_def *block = nir_udiv_imm(b, linear, height);
id_x = nir_umod(b, block, size_x);
id_y = nir_umod(b,
nir_iadd(b,
nir_umod_imm(b, linear, height),
nir_imul_imm(b,
nir_udiv(b, block, size_x),
height)),
size_y);
} else {
/* Y-major lid order. Optimal for tileY accesses only,
* which are usually images. X,Y ordering will look like:
* (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
*/
id_y = nir_umod(b, linear, size_y);
id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
}
id_z = nir_udiv(b, linear, size_xy);
*local_id = nir_vec3(b, id_x, id_y, id_z);
if (!*local_index) {
*local_index = nir_iadd(b, nir_iadd(b, id_x,
nir_imul(b, id_y, size_x)),
nir_imul(b, id_z, size_xy));
}
break;
case DERIVATIVE_GROUP_LINEAR:
/* For linear, just set the local invocation index linearly,
* and calculate local invocation ID from that.
*/
id_x = nir_umod(b, linear, size_x);
id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
id_z = nir_udiv(b, linear, size_xy);
*local_id = nir_vec3(b, id_x, id_y, id_z);
*local_index = linear;
break;
case DERIVATIVE_GROUP_QUADS: {
/* For quads, first we figure out the 2x2 grid the invocation
* belongs to -- treating extra Z layers as just more rows.
* Then map that into local invocation ID (trivial) and local
* invocation index. Skipping Z simplifies the index calculation.
*/
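/* Example with a hypothetical size_x = 4 (double_size_x = 8): linear = 6
 * gives row_pair_id = 6, y_row_pairs = 0, so x = (6 & 1) | ((6 >> 1) & ~1)
 * = 2 and y = (0 << 1) | ((6 >> 1) & 1) = 1. Invocations 4..7 thus form
 * the second 2x2 quad, covering (2,0) (3,0) (2,1) (3,1).
 */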
nir_def *one = nir_imm_int(b, 1);
nir_def *double_size_x = nir_ishl(b, size_x, one);
/* ID within a pair of rows, where each group of 4 is 2x2 quad. */
nir_def *row_pair_id = nir_umod(b, linear, double_size_x);
nir_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
nir_def *x =
nir_ior(b,
nir_iand(b, row_pair_id, one),
nir_iand(b, nir_ishr(b, row_pair_id, one),
nir_imm_int(b, 0xfffffffe)));
nir_def *y =
nir_ior(b,
nir_ishl(b, y_row_pairs, one),
nir_iand(b, nir_ishr(b, row_pair_id, one), one));
*local_id = nir_vec3(b, x,
nir_umod(b, y, size_y),
nir_udiv(b, y, size_y));
*local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
break;
}
default:
unreachable("invalid derivative group");
}
}
static bool
lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
nir_block *block)
{
bool progress = false;
nir_builder *b = &state->builder;
nir_shader *nir = state->nir;
/* Reuse calculated values inside the block. */
nir_def *local_index = NULL;
nir_def *local_id = NULL;
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
b->cursor = nir_after_instr(&intrinsic->instr);
nir_def *sysval;
switch (intrinsic->intrinsic) {
case nir_intrinsic_load_local_invocation_id:
if (state->hw_generated_local_id)
continue;
FALLTHROUGH;
case nir_intrinsic_load_local_invocation_index: {
if (!local_index && !nir->info.workgroup_size_variable) {
const uint16_t *ws = nir->info.workgroup_size;
if (ws[0] * ws[1] * ws[2] == 1) {
nir_def *zero = nir_imm_int(b, 0);
local_index = zero;
local_id = nir_replicate(b, zero, 3);
}
}
if (!local_index) {
if (nir->info.stage == MESA_SHADER_TASK ||
nir->info.stage == MESA_SHADER_MESH) {
/* Will be lowered by nir_emit_task_mesh_intrinsic() using
* information from the payload.
*/
continue;
}
if (state->hw_generated_local_id) {
nir_def *local_id_vec = nir_load_local_invocation_id(b);
/* Distinct name to avoid shadowing the block-level local_id. */
nir_def *lid[3] = { nir_channel(b, local_id_vec, 0),
nir_channel(b, local_id_vec, 1),
nir_channel(b, local_id_vec, 2) };
nir_def *size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
nir_def *size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
sysval = nir_imul(b, lid[2], nir_imul(b, size_x, size_y));
sysval = nir_iadd(b, sysval, nir_imul(b, lid[1], size_x));
sysval = nir_iadd(b, sysval, lid[0]);
local_index = sysval;
break;
}
/* First time we are using those, so let's calculate them. */
assert(!local_id);
compute_local_index_id(b, nir, &local_index, &local_id);
}
/* With HW-generated IDs, local_id is never computed here, so only
 * assert on it when it is actually needed.
 */
assert(local_index);
if (intrinsic->intrinsic == nir_intrinsic_load_local_invocation_id) {
assert(local_id);
sysval = local_id;
} else {
sysval = local_index;
}
break;
}
case nir_intrinsic_load_num_subgroups: {
nir_def *size;
if (state->nir->info.workgroup_size_variable) {
nir_def *size_xyz = nir_load_workgroup_size(b);
nir_def *size_x = nir_channel(b, size_xyz, 0);
nir_def *size_y = nir_channel(b, size_xyz, 1);
nir_def *size_z = nir_channel(b, size_xyz, 2);
size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
} else {
size = nir_imm_int(b, nir->info.workgroup_size[0] *
nir->info.workgroup_size[1] *
nir->info.workgroup_size[2]);
}
/* Calculate the equivalent of DIV_ROUND_UP. */
nir_def *simd_width = nir_load_simd_width_intel(b);
sysval =
nir_udiv(b, nir_iadd_imm(b, nir_iadd(b, size, simd_width), -1),
simd_width);
break;
}
default:
continue;
}
if (intrinsic->def.bit_size == 64)
sysval = nir_u2u64(b, sysval);
nir_def_rewrite_uses(&intrinsic->def, sysval);
nir_instr_remove(&intrinsic->instr);
state->progress = true;
progress = true;
}
return progress;
}
static void
lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state)
{
state->builder = nir_builder_create(state->impl);
nir_foreach_block(block, state->impl) {
lower_cs_intrinsics_convert_block(state, block);
}
nir_metadata_preserve(state->impl,
nir_metadata_block_index | nir_metadata_dominance);
}
bool
brw_nir_lower_cs_intrinsics(nir_shader *nir,
const struct intel_device_info *devinfo,
struct brw_cs_prog_data *prog_data)
{
assert(gl_shader_stage_uses_workgroup(nir->info.stage));
struct lower_intrinsics_state state = {
.nir = nir,
.hw_generated_local_id = false,
};
/* Constraints from NV_compute_shader_derivatives. */
if (gl_shader_stage_is_compute(nir->info.stage) &&
!nir->info.workgroup_size_variable) {
if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
assert(nir->info.workgroup_size[0] % 2 == 0);
assert(nir->info.workgroup_size[1] % 2 == 0);
} else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR) {
ASSERTED unsigned workgroup_size =
nir->info.workgroup_size[0] *
nir->info.workgroup_size[1] *
nir->info.workgroup_size[2];
assert(workgroup_size % 4 == 0);
}
}
if (devinfo->verx10 >= 125 && prog_data &&
nir->info.stage == MESA_SHADER_COMPUTE &&
nir->info.cs.derivative_group != DERIVATIVE_GROUP_QUADS &&
!nir->info.workgroup_size_variable &&
util_is_power_of_two_nonzero(nir->info.workgroup_size[0]) &&
util_is_power_of_two_nonzero(nir->info.workgroup_size[1])) {
state.hw_generated_local_id = true;
/* TODO: more heuristics about 1D/SLM access vs. 2D access */
bool linear =
BITSET_TEST(nir->info.system_values_read,
SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) ||
(nir->info.workgroup_size[1] == 1 &&
nir->info.workgroup_size[2] == 1) ||
(nir->info.num_images == 0 && nir->info.num_textures == 0);
prog_data->walk_order =
linear ? INTEL_WALK_ORDER_XYZ : INTEL_WALK_ORDER_YXZ;
/* nir_lower_compute_system_values will replace any references to
* SYSTEM_VALUE_LOCAL_INVOCATION_ID vector components with zero for
* any dimension where the workgroup size is 1, so we can skip
* generating those. However, the hardware can only generate
* X, XY, or XYZ - it can't skip earlier components.
*/
prog_data->generate_local_id =
(nir->info.workgroup_size[0] > 1 ? WRITEMASK_X : 0) |
(nir->info.workgroup_size[1] > 1 ? WRITEMASK_XY : 0) |
(nir->info.workgroup_size[2] > 1 ? WRITEMASK_XYZ : 0);
}
nir_foreach_function_impl(impl, nir) {
state.impl = impl;
lower_cs_intrinsics_convert_impl(&state);
}
return state.progress;
}

View file

@@ -0,0 +1,273 @@
/*
* Copyright (c) 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
static nir_function_impl *
lower_any_hit_for_intersection(nir_shader *any_hit)
{
nir_function_impl *impl = nir_shader_get_entrypoint(any_hit);
/* Any-hit shaders need three parameters */
assert(impl->function->num_params == 0);
nir_parameter params[] = {
{
/* A pointer to a boolean value for whether or not the hit was
* accepted.
*/
.num_components = 1,
.bit_size = 32,
},
{
/* The hit T value */
.num_components = 1,
.bit_size = 32,
},
{
/* The hit kind */
.num_components = 1,
.bit_size = 32,
},
};
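/* After this rewrite the any-hit entrypoint behaves like a function with
 * the C-style signature:
 *
 *    void any_hit(bool *commit, float hit_t, uint32_t hit_kind);
 */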
impl->function->num_params = ARRAY_SIZE(params);
impl->function->params =
ralloc_array(any_hit, nir_parameter, ARRAY_SIZE(params));
memcpy(impl->function->params, params, sizeof(params));
nir_builder build = nir_builder_at(nir_before_impl(impl));
nir_builder *b = &build;
nir_def *commit_ptr = nir_load_param(b, 0);
nir_def *hit_t = nir_load_param(b, 1);
nir_def *hit_kind = nir_load_param(b, 2);
nir_deref_instr *commit =
nir_build_deref_cast(b, commit_ptr, nir_var_function_temp,
glsl_bool_type(), 0);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
switch (instr->type) {
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_ignore_ray_intersection:
b->cursor = nir_instr_remove(&intrin->instr);
/* We put the newly emitted code inside a dummy if because it's
* going to contain a jump instruction and we don't want to
* deal with that mess here. It'll get dealt with by our
* control-flow optimization passes.
*/
nir_store_deref(b, commit, nir_imm_false(b), 0x1);
nir_push_if(b, nir_imm_true(b));
nir_jump(b, nir_jump_return);
nir_pop_if(b, NULL);
break;
case nir_intrinsic_terminate_ray:
/* The "normal" handling of terminateRay works fine in
* intersection shaders.
*/
break;
case nir_intrinsic_load_ray_t_max:
nir_def_rewrite_uses(&intrin->def,
hit_t);
nir_instr_remove(&intrin->instr);
break;
case nir_intrinsic_load_ray_hit_kind:
nir_def_rewrite_uses(&intrin->def,
hit_kind);
nir_instr_remove(&intrin->instr);
break;
default:
break;
}
break;
}
case nir_instr_type_jump: {
/* Stomp any halts to returns since they only return from the
* any-hit shader and not necessarily from the intersection
* shader. This is safe to do because we've already asserted
* that we only have the one function.
*/
nir_jump_instr *jump = nir_instr_as_jump(instr);
if (jump->type == nir_jump_halt)
jump->type = nir_jump_return;
break;
}
default:
break;
}
}
}
nir_validate_shader(any_hit, "after initial any-hit lowering");
nir_lower_returns_impl(impl);
nir_validate_shader(any_hit, "after lowering returns");
return impl;
}
void
brw_nir_lower_intersection_shader(nir_shader *intersection,
const nir_shader *any_hit,
const struct intel_device_info *devinfo)
{
void *dead_ctx = ralloc_context(intersection);
nir_function_impl *any_hit_impl = NULL;
struct hash_table *any_hit_var_remap = NULL;
if (any_hit) {
nir_shader *any_hit_tmp = nir_shader_clone(dead_ctx, any_hit);
NIR_PASS_V(any_hit_tmp, nir_opt_dce);
any_hit_impl = lower_any_hit_for_intersection(any_hit_tmp);
any_hit_var_remap = _mesa_pointer_hash_table_create(dead_ctx);
}
nir_function_impl *impl = nir_shader_get_entrypoint(intersection);
nir_builder build = nir_builder_at(nir_before_impl(impl));
nir_builder *b = &build;
nir_def *t_addr = brw_nir_rt_mem_hit_addr(b, false /* committed */);
nir_variable *commit =
nir_local_variable_create(impl, glsl_bool_type(), "ray_commit");
nir_store_var(b, commit, nir_imm_false(b), 0x1);
assert(impl->end_block->predecessors->entries == 1);
set_foreach(impl->end_block->predecessors, block_entry) {
struct nir_block *block = (void *)block_entry->key;
b->cursor = nir_after_block_before_jump(block);
nir_push_if(b, nir_load_var(b, commit));
{
/* Set the "valid" bit in mem_hit */
nir_def *ray_addr = brw_nir_rt_mem_hit_addr(b, false /* committed */);
nir_def *flags_dw_addr = nir_iadd_imm(b, ray_addr, 12);
nir_store_global(b, flags_dw_addr, 4,
nir_ior(b, nir_load_global(b, flags_dw_addr, 4, 1, 32),
nir_imm_int(b, 1 << 16)), 0x1 /* write_mask */);
nir_accept_ray_intersection(b);
}
nir_push_else(b, NULL);
{
nir_ignore_ray_intersection(b);
}
nir_pop_if(b, NULL);
break;
}
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
switch (instr->type) {
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_report_ray_intersection: {
b->cursor = nir_instr_remove(&intrin->instr);
nir_def *hit_t = intrin->src[0].ssa;
nir_def *hit_kind = intrin->src[1].ssa;
nir_def *min_t = nir_load_ray_t_min(b);
struct brw_nir_rt_mem_ray_defs ray_def;
brw_nir_rt_load_mem_ray(b, &ray_def, BRW_RT_BVH_LEVEL_WORLD);
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_hit(b, &hit_in, false);
nir_def *max_t = ray_def.t_far;
/* bool commit_tmp = false; */
nir_variable *commit_tmp =
nir_local_variable_create(impl, glsl_bool_type(),
"commit_tmp");
nir_store_var(b, commit_tmp, nir_imm_false(b), 0x1);
nir_push_if(b, nir_iand(b, nir_fge(b, hit_t, min_t),
nir_fge(b, max_t, hit_t)));
{
/* Any-hit defaults to commit */
nir_store_var(b, commit_tmp, nir_imm_true(b), 0x1);
if (any_hit_impl != NULL) {
nir_push_if(b, nir_inot(b, nir_load_leaf_opaque_intel(b)));
{
nir_def *params[] = {
&nir_build_deref_var(b, commit_tmp)->def,
hit_t,
hit_kind,
};
nir_inline_function_impl(b, any_hit_impl, params,
any_hit_var_remap);
}
nir_pop_if(b, NULL);
}
nir_push_if(b, nir_load_var(b, commit_tmp));
{
nir_store_var(b, commit, nir_imm_true(b), 0x1);
nir_def *ray_addr =
brw_nir_rt_mem_ray_addr(b, brw_nir_rt_stack_addr(b), BRW_RT_BVH_LEVEL_WORLD);
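/* Offset 16 + 12 = 28 within MemRay is the t_far dword; shortening the
 * ray here keeps further traversal from finding a farther hit.
 */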
nir_store_global(b, nir_iadd_imm(b, ray_addr, 16 + 12), 4, hit_t, 0x1);
nir_store_global(b, t_addr, 4,
nir_vec2(b, nir_fmin(b, hit_t, hit_in.t), hit_kind),
0x3);
}
nir_pop_if(b, NULL);
}
nir_pop_if(b, NULL);
nir_def *accepted = nir_load_var(b, commit_tmp);
nir_def_rewrite_uses(&intrin->def,
accepted);
break;
}
default:
break;
}
break;
}
default:
break;
}
}
}
nir_metadata_preserve(impl, nir_metadata_none);
/* We did some inlining; have to re-index SSA defs */
nir_index_ssa_defs(impl);
ralloc_free(dead_ctx);
}

View file

@@ -0,0 +1,567 @@
/*
* Copyright (c) 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
#include "nir_deref.h"
#include "util/macros.h"
struct lowering_state {
const struct intel_device_info *devinfo;
nir_function_impl *impl;
struct hash_table *queries;
uint32_t n_queries;
struct brw_nir_rt_globals_defs globals;
nir_def *rq_globals;
};
struct brw_ray_query {
nir_variable *opaque_var;
nir_variable *internal_var;
uint32_t id;
};
#define SIZEOF_QUERY_STATE (sizeof(uint32_t))
static bool
need_spill_fill(struct lowering_state *state)
{
return state->n_queries > 1;
}
/**
 * This pass converts opaque RayQuery structures from SPIR-V into a vec3 where
 * the first 2 elements store a global address for the query and the third
 * element is a counter incremented on each executed
 * nir_intrinsic_rq_proceed.
 */
static void
register_opaque_var(nir_variable *opaque_var, struct lowering_state *state)
{
struct hash_entry *entry = _mesa_hash_table_search(state->queries, opaque_var);
assert(entry == NULL);
struct brw_ray_query *rq = rzalloc(state->queries, struct brw_ray_query);
rq->opaque_var = opaque_var;
rq->id = state->n_queries;
unsigned aoa_size = glsl_get_aoa_size(opaque_var->type);
state->n_queries += MAX2(1, aoa_size);
_mesa_hash_table_insert(state->queries, opaque_var, rq);
}
static void
create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
{
const struct glsl_type *opaque_type = rq->opaque_var->type;
const struct glsl_type *internal_type = glsl_uint16_t_type();
while (glsl_type_is_array(opaque_type)) {
assert(!glsl_type_is_unsized_array(opaque_type));
internal_type = glsl_array_type(internal_type,
glsl_array_size(opaque_type),
0);
opaque_type = glsl_get_array_element(opaque_type);
}
rq->internal_var = nir_local_variable_create(state->impl,
internal_type,
NULL);
}
static nir_def *
get_ray_query_shadow_addr(nir_builder *b,
nir_deref_instr *deref,
struct lowering_state *state,
nir_deref_instr **out_state_deref)
{
nir_deref_path path;
nir_deref_path_init(&path, deref, NULL);
assert(path.path[0]->deref_type == nir_deref_type_var);
nir_variable *opaque_var = nir_deref_instr_get_variable(path.path[0]);
struct hash_entry *entry = _mesa_hash_table_search(state->queries, opaque_var);
assert(entry);
struct brw_ray_query *rq = entry->data;
/* Base address in the shadow memory of the variable associated with this
* ray query variable.
*/
nir_def *base_addr =
nir_iadd_imm(b, state->globals.resume_sbt_addr,
brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
bool spill_fill = need_spill_fill(state);
*out_state_deref = nir_build_deref_var(b, rq->internal_var);
if (!spill_fill)
return NULL;
/* Just emit code and let constant-folding go to town */
nir_deref_instr **p = &path.path[1];
for (; *p; p++) {
if ((*p)->deref_type == nir_deref_type_array) {
nir_def *index = (*p)->arr.index.ssa;
/* Walk the internal-state deref in lockstep with the opaque deref. */
*out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
/* Each array element covers the shadow stacks of all queries below it. */
uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
brw_rt_ray_queries_shadow_stack_size(state->devinfo);
nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
base_addr = nir_iadd(b, base_addr, mul);
} else {
unreachable("Unsupported deref type");
}
}
nir_deref_path_finish(&path);
/* Add the lane offset to the shadow memory address */
nir_def *lane_offset =
nir_imul_imm(
b,
nir_iadd(
b,
nir_imul(
b,
brw_load_btd_dss_id(b),
brw_nir_rt_load_num_simd_lanes_per_dss(b, state->devinfo)),
brw_nir_rt_sync_stack_id(b)),
BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
}
static void
update_trace_ctrl_level(nir_builder *b,
nir_deref_instr *state_deref,
nir_def **out_old_ctrl,
nir_def **out_old_level,
nir_def *new_ctrl,
nir_def *new_level)
{
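/* The internal state variable is a uint16 packing the last trace-ray
 * control value and BVH level: bits [1:0] hold the level and bits [15:2]
 * hold the control value (see the shifts below).
 */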
nir_def *old_value = nir_load_deref(b, state_deref);
nir_def *old_ctrl = nir_ishr_imm(b, old_value, 2);
nir_def *old_level = nir_iand_imm(b, old_value, 0x3);
if (out_old_ctrl)
*out_old_ctrl = old_ctrl;
if (out_old_level)
*out_old_level = old_level;
if (new_ctrl)
new_ctrl = nir_i2i16(b, new_ctrl);
if (new_level)
new_level = nir_i2i16(b, new_level);
if (new_ctrl || new_level) {
if (!new_ctrl)
new_ctrl = old_ctrl;
if (!new_level)
new_level = old_level;
nir_def *new_value = nir_ior(b, nir_ishl_imm(b, new_ctrl, 2), new_level);
nir_store_deref(b, state_deref, new_value, 0x1);
}
}
static void
fill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr,
nir_def *ctrl)
{
brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
spill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr)
{
brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
lower_ray_query_intrinsic(nir_builder *b,
nir_intrinsic_instr *intrin,
struct lowering_state *state)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
b->cursor = nir_instr_remove(&intrin->instr);
nir_deref_instr *ctrl_level_deref;
nir_def *shadow_stack_addr =
get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
nir_def *hw_stack_addr =
brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr, state->devinfo);
nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
switch (intrin->intrinsic) {
case nir_intrinsic_rq_initialize: {
nir_def *as_addr = intrin->src[1].ssa;
nir_def *ray_flags = intrin->src[2].ssa;
/* From the SPIR-V spec:
*
* "Only the 8 least-significant bits of Cull Mask are used by
* this instruction - other bits are ignored.
*
* Only the 16 least-significant bits of Miss Index are used by
* this instruction - other bits are ignored."
*/
nir_def *cull_mask = nir_iand_imm(b, intrin->src[3].ssa, 0xff);
nir_def *ray_orig = intrin->src[4].ssa;
nir_def *ray_t_min = intrin->src[5].ssa;
nir_def *ray_dir = intrin->src[6].ssa;
nir_def *ray_t_max = intrin->src[7].ssa;
nir_def *root_node_ptr =
brw_nir_rt_acceleration_structure_to_root_node(b, as_addr);
struct brw_nir_rt_mem_ray_defs ray_defs = {
.root_node_ptr = root_node_ptr,
.ray_flags = nir_u2u16(b, ray_flags),
.ray_mask = cull_mask,
.orig = ray_orig,
.t_near = ray_t_min,
.dir = ray_dir,
.t_far = ray_t_max,
};
nir_def *ray_addr =
brw_nir_rt_mem_ray_addr(b, stack_addr, BRW_RT_BVH_LEVEL_WORLD);
brw_nir_rt_query_mark_init(b, stack_addr);
brw_nir_rt_store_mem_ray_query_at_addr(b, ray_addr, &ray_defs);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD));
break;
}
case nir_intrinsic_rq_proceed: {
nir_def *not_done =
nir_inot(b, brw_nir_rt_query_done(b, stack_addr));
nir_def *not_done_then, *not_done_else;
nir_push_if(b, not_done);
{
nir_def *ctrl, *level;
update_trace_ctrl_level(b, ctrl_level_deref,
&ctrl, &level,
NULL,
NULL);
/* Mark the query as done before handing it over to the HW for
* processing. If the HW makes any progress, it will write back some
* data and, as a side effect, clear the "done" bit. If no progress is
* made, the HW does not write anything back and we can use this bit to
* detect that.
*/
brw_nir_rt_query_mark_done(b, stack_addr);
if (shadow_stack_addr)
fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
nir_trace_ray_intel(b, state->rq_globals, level, ctrl, .synchronous = true);
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false);
if (shadow_stack_addr)
spill_query(b, hw_stack_addr, shadow_stack_addr);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
hit_in.bvh_level);
not_done_then = nir_inot(b, hit_in.done);
}
nir_push_else(b, NULL);
{
not_done_else = nir_imm_false(b);
}
nir_pop_if(b, NULL);
not_done = nir_if_phi(b, not_done_then, not_done_else);
nir_def_rewrite_uses(&intrin->def, not_done);
break;
}
case nir_intrinsic_rq_confirm_intersection: {
brw_nir_memcpy_global(b,
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true), 16,
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false), 16,
BRW_RT_SIZEOF_HIT_INFO);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_COMMIT),
nir_imm_int(b, BRW_RT_BVH_LEVEL_OBJECT));
break;
}
case nir_intrinsic_rq_generate_intersection: {
brw_nir_rt_generate_hit_addr(b, stack_addr, intrin->src[1].ssa);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_COMMIT),
nir_imm_int(b, BRW_RT_BVH_LEVEL_OBJECT));
break;
}
case nir_intrinsic_rq_terminate: {
brw_nir_rt_query_mark_done(b, stack_addr);
break;
}
case nir_intrinsic_rq_load: {
const bool committed = nir_intrinsic_committed(intrin);
struct brw_nir_rt_mem_ray_defs world_ray_in = {};
struct brw_nir_rt_mem_ray_defs object_ray_in = {};
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_ray_from_addr(b, &world_ray_in, stack_addr,
BRW_RT_BVH_LEVEL_WORLD);
brw_nir_rt_load_mem_ray_from_addr(b, &object_ray_in, stack_addr,
BRW_RT_BVH_LEVEL_OBJECT);
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, committed);
nir_def *sysval = NULL;
switch (nir_intrinsic_ray_query_value(intrin)) {
case nir_ray_query_value_intersection_type:
if (committed) {
/* Values we want to generate :
*
* RayQueryCommittedIntersectionNoneEXT = 0U <= hit_in.valid == false
* RayQueryCommittedIntersectionTriangleEXT = 1U <= hit_in.leaf_type == BRW_RT_BVH_NODE_TYPE_QUAD (4)
* RayQueryCommittedIntersectionGeneratedEXT = 2U <= hit_in.leaf_type == BRW_RT_BVH_NODE_TYPE_PROCEDURAL (3)
*/
sysval =
nir_bcsel(b, nir_ieq_imm(b, hit_in.leaf_type, 4),
nir_imm_int(b, 1), nir_imm_int(b, 2));
sysval =
nir_bcsel(b, hit_in.valid,
sysval, nir_imm_int(b, 0));
} else {
/* 0 -> triangle, 1 -> AABB */
sysval =
nir_b2i32(b,
nir_ieq_imm(b, hit_in.leaf_type,
BRW_RT_BVH_NODE_TYPE_PROCEDURAL));
}
break;
case nir_ray_query_value_intersection_t:
sysval = hit_in.t;
break;
case nir_ray_query_value_intersection_instance_custom_index: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.instance_id;
break;
}
case nir_ray_query_value_intersection_instance_id: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.instance_index;
break;
}
case nir_ray_query_value_intersection_instance_sbt_index: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.contribution_to_hit_group_index;
break;
}
case nir_ray_query_value_intersection_geometry_index: {
nir_def *geometry_index_dw =
nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
1, 32);
sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
break;
}
case nir_ray_query_value_intersection_primitive_index:
sysval = brw_nir_rt_load_primitive_id_from_hit(b, NULL /* is_procedural */, &hit_in);
break;
case nir_ray_query_value_intersection_barycentrics:
sysval = hit_in.tri_bary;
break;
case nir_ray_query_value_intersection_front_face:
sysval = hit_in.front_face;
break;
case nir_ray_query_value_intersection_object_ray_direction:
sysval = world_ray_in.dir;
break;
case nir_ray_query_value_intersection_object_ray_origin:
sysval = world_ray_in.orig;
break;
case nir_ray_query_value_intersection_object_to_world: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
break;
}
case nir_ray_query_value_intersection_world_to_object: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
break;
}
case nir_ray_query_value_intersection_candidate_aabb_opaque:
sysval = hit_in.front_face;
break;
case nir_ray_query_value_tmin:
sysval = world_ray_in.t_near;
break;
case nir_ray_query_value_flags:
sysval = nir_u2u32(b, world_ray_in.ray_flags);
break;
case nir_ray_query_value_world_ray_direction:
sysval = world_ray_in.dir;
break;
case nir_ray_query_value_world_ray_origin:
sysval = world_ray_in.orig;
break;
case nir_ray_query_value_intersection_triangle_vertex_positions: {
struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos, hit_in.prim_leaf_ptr);
sysval = pos.positions[nir_intrinsic_column(intrin)];
break;
}
default:
unreachable("Invalid ray query");
}
assert(sysval);
nir_def_rewrite_uses(&intrin->def, sysval);
break;
}
default:
unreachable("Invalid intrinsic");
}
}
static void
lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
{
nir_builder _b, *b = &_b;
_b = nir_builder_at(nir_before_impl(impl));
state->rq_globals = nir_load_ray_query_global_intel(b);
brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_rq_initialize &&
intrin->intrinsic != nir_intrinsic_rq_terminate &&
intrin->intrinsic != nir_intrinsic_rq_proceed &&
intrin->intrinsic != nir_intrinsic_rq_generate_intersection &&
intrin->intrinsic != nir_intrinsic_rq_confirm_intersection &&
intrin->intrinsic != nir_intrinsic_rq_load)
continue;
lower_ray_query_intrinsic(b, intrin, state);
}
}
nir_metadata_preserve(impl, nir_metadata_none);
}
bool
brw_nir_lower_ray_queries(nir_shader *shader,
const struct intel_device_info *devinfo)
{
assert(exec_list_length(&shader->functions) == 1);
struct lowering_state state = {
.devinfo = devinfo,
.impl = nir_shader_get_entrypoint(shader),
.queries = _mesa_pointer_hash_table_create(NULL),
};
/* Map each query variable to an internal state variable */
nir_foreach_function_temp_variable(var, state.impl)
register_opaque_var(var, &state);
hash_table_foreach(state.queries, entry)
create_internal_var(entry->data, &state);
bool progress = state.n_queries > 0;
if (progress) {
lower_ray_query_impl(state.impl, &state);
nir_remove_dead_derefs(shader);
nir_remove_dead_variables(shader,
nir_var_shader_temp | nir_var_function_temp,
NULL);
nir_metadata_preserve(state.impl, nir_metadata_none);
}
ralloc_free(state.queries);
return progress;
}

View file

@@ -0,0 +1,386 @@
/*
* Copyright (c) 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
static nir_def *
build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
{
switch (b->shader->info.stage) {
case MESA_SHADER_ANY_HIT:
/* Any-hit shaders are always compiled into intersection shaders for
* procedural geometry. If we got here in an any-hit shader, it's for
* triangles.
*/
return nir_imm_false(b);
case MESA_SHADER_INTERSECTION:
return nir_imm_true(b);
default:
return nir_ieq_imm(b, hit->leaf_type,
BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
}
}
static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
const struct intel_device_info *devinfo)
{
bool progress = false;
nir_builder build = nir_builder_at(nir_before_impl(impl));
nir_builder *b = &build;
struct brw_nir_rt_globals_defs globals;
brw_nir_rt_load_globals(b, &globals);
nir_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
nir_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);
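/* The hotzone is a uvec4: component 0 holds the per-thread scratch stack
 * offset and components 1-3 hold the ray launch ID (hence the 0xe channel
 * mask used for load_ray_launch_id below).
 */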
gl_shader_stage stage = b->shader->info.stage;
struct brw_nir_rt_mem_ray_defs world_ray_in = {};
struct brw_nir_rt_mem_ray_defs object_ray_in = {};
struct brw_nir_rt_mem_hit_defs hit_in = {};
switch (stage) {
case MESA_SHADER_ANY_HIT:
case MESA_SHADER_CLOSEST_HIT:
case MESA_SHADER_INTERSECTION:
brw_nir_rt_load_mem_hit(b, &hit_in,
stage == MESA_SHADER_CLOSEST_HIT);
brw_nir_rt_load_mem_ray(b, &object_ray_in,
BRW_RT_BVH_LEVEL_OBJECT);
FALLTHROUGH;
case MESA_SHADER_MISS:
brw_nir_rt_load_mem_ray(b, &world_ray_in,
BRW_RT_BVH_LEVEL_WORLD);
break;
default:
break;
}
nir_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
nir_def *stack_base_offset = nir_channel(b, hotzone, 0);
nir_def *stack_base_addr =
nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
ASSERTED bool seen_scratch_base_ptr_load = false;
ASSERTED bool found_resume = false;
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
b->cursor = nir_after_instr(&intrin->instr);
nir_def *sysval = NULL;
switch (intrin->intrinsic) {
case nir_intrinsic_load_scratch_base_ptr:
assert(nir_intrinsic_base(intrin) == 1);
seen_scratch_base_ptr_load = true;
sysval = stack_base_addr;
break;
case nir_intrinsic_btd_stack_push_intel: {
int32_t stack_size = nir_intrinsic_stack_size(intrin);
if (stack_size > 0) {
nir_def *child_stack_offset =
nir_iadd_imm(b, stack_base_offset, stack_size);
nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
}
nir_instr_remove(instr);
break;
}
case nir_intrinsic_rt_resume:
/* This is the first "interesting" instruction */
assert(block == nir_start_block(impl));
assert(!seen_scratch_base_ptr_load);
found_resume = true;
int32_t stack_size = nir_intrinsic_stack_size(intrin);
if (stack_size > 0) {
stack_base_offset =
nir_iadd_imm(b, stack_base_offset, -stack_size);
nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
stack_base_addr = nir_iadd(b, thread_stack_base_addr,
nir_u2u64(b, stack_base_offset));
}
nir_instr_remove(instr);
break;
case nir_intrinsic_load_uniform: {
/* We don't want to lower this in the launch trampoline. */
if (stage == MESA_SHADER_COMPUTE)
break;
sysval = brw_nir_load_global_const(b, intrin,
nir_load_btd_global_arg_addr_intel(b),
BRW_RT_PUSH_CONST_OFFSET);
break;
}
case nir_intrinsic_load_ray_launch_id:
sysval = nir_channels(b, hotzone, 0xe);
break;
case nir_intrinsic_load_ray_launch_size:
sysval = globals.launch_size;
break;
case nir_intrinsic_load_ray_world_origin:
sysval = world_ray_in.orig;
break;
case nir_intrinsic_load_ray_world_direction:
sysval = world_ray_in.dir;
break;
case nir_intrinsic_load_ray_object_origin:
sysval = object_ray_in.orig;
break;
case nir_intrinsic_load_ray_object_direction:
sysval = object_ray_in.dir;
break;
case nir_intrinsic_load_ray_t_min:
/* It shouldn't matter which we pull this from */
sysval = world_ray_in.t_near;
break;
case nir_intrinsic_load_ray_t_max:
if (stage == MESA_SHADER_MISS)
sysval = world_ray_in.t_far;
else
sysval = hit_in.t;
break;
case nir_intrinsic_load_primitive_id:
sysval = brw_nir_rt_load_primitive_id_from_hit(b,
build_leaf_is_procedural(b, &hit_in),
&hit_in);
break;
case nir_intrinsic_load_instance_id: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.instance_index;
break;
}
case nir_intrinsic_load_ray_object_to_world: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
break;
}
case nir_intrinsic_load_ray_world_to_object: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
break;
}
case nir_intrinsic_load_ray_hit_kind: {
nir_def *tri_hit_kind =
nir_bcsel(b, hit_in.front_face,
nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
hit_in.aabb_hit_kind, tri_hit_kind);
break;
}
case nir_intrinsic_load_ray_flags:
/* We need to fetch the original ray flags we stored in the
* leaf pointer, because the actual ray flags we get here
* will include any flags passed on the pipeline at creation
* time, and the spec for IncomingRayFlagsKHR says:
*
*    "Setting pipeline flags on the raytracing pipeline must not
*     cause any corresponding flags to be set in variables with
*     this decoration."
*/
sysval = nir_u2u32(b, world_ray_in.inst_leaf_ptr);
break;
case nir_intrinsic_load_cull_mask:
sysval = nir_u2u32(b, world_ray_in.ray_mask);
break;
case nir_intrinsic_load_ray_geometry_index: {
nir_def *geometry_index_dw =
nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
1, 32);
sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
break;
}
case nir_intrinsic_load_ray_instance_custom_index: {
struct brw_nir_rt_bvh_instance_leaf_defs leaf;
brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
sysval = leaf.instance_id;
break;
}
case nir_intrinsic_load_shader_record_ptr:
/* We can't handle this intrinsic in resume shaders because the
* handle we get there won't be from the original SBT. The shader
* call lowering/splitting pass should have ensured that this
* value was spilled from the initial shader and unspilled in any
* resume shaders that need it.
*/
assert(!found_resume);
sysval = nir_load_btd_local_arg_addr_intel(b);
break;
case nir_intrinsic_load_ray_base_mem_addr_intel:
sysval = globals.base_mem_addr;
break;
case nir_intrinsic_load_ray_hw_stack_size_intel:
sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
break;
case nir_intrinsic_load_ray_sw_stack_size_intel:
sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
break;
case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
sysval = globals.num_dss_rt_stacks;
break;
case nir_intrinsic_load_ray_hit_sbt_addr_intel:
sysval = globals.hit_sbt_addr;
break;
case nir_intrinsic_load_ray_hit_sbt_stride_intel:
sysval = globals.hit_sbt_stride;
break;
case nir_intrinsic_load_ray_miss_sbt_addr_intel:
sysval = globals.miss_sbt_addr;
break;
case nir_intrinsic_load_ray_miss_sbt_stride_intel:
sysval = globals.miss_sbt_stride;
break;
case nir_intrinsic_load_callable_sbt_addr_intel:
sysval = globals.call_sbt_addr;
break;
case nir_intrinsic_load_callable_sbt_stride_intel:
sysval = globals.call_sbt_stride;
break;
case nir_intrinsic_load_btd_resume_sbt_addr_intel:
sysval = nir_pack_64_2x32_split(b,
nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
break;
case nir_intrinsic_load_leaf_procedural_intel:
sysval = build_leaf_is_procedural(b, &hit_in);
break;
case nir_intrinsic_load_ray_triangle_vertex_positions: {
struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos, hit_in.prim_leaf_ptr);
sysval = pos.positions[nir_intrinsic_column(intrin)];
break;
}
case nir_intrinsic_load_leaf_opaque_intel: {
if (stage == MESA_SHADER_INTERSECTION) {
/* In intersection shaders, the opaque bit is passed to us in
* the front_face bit.
*/
sysval = hit_in.front_face;
} else {
nir_def *flags_dw =
nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
1, 32);
sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
}
break;
}
default:
continue;
}
progress = true;
if (sysval) {
nir_def_rewrite_uses(&intrin->def,
sysval);
nir_instr_remove(&intrin->instr);
}
}
}
nir_metadata_preserve(impl,
progress ?
nir_metadata_none :
(nir_metadata_block_index |
nir_metadata_dominance));
}
/** Lower ray-tracing system values and intrinsics
*
* In most 3D shader stages, intrinsics are a fairly thin wrapper around
* hardware functionality and system values represent magic bits that come
* into the shader from FF hardware. Ray-tracing, however, looks a bit more
* like the OpenGL 1.0 world where the underlying hardware is simple and most
* of the API implementation is software.
*
* In particular, most things that are treated as system values (or built-ins
* in SPIR-V) don't get magically dropped into registers for us. Instead, we
* have to fetch them from the relevant data structures shared with the
* ray-tracing hardware. Most come from either the RT_DISPATCH_GLOBALS or
* from one of the MemHit data structures. Some, such as primitive_id, require
* us to fetch the leaf address from the MemHit struct and then manually read
* the data out of the BVH. Instead of trying to emit all this code deep in
* the back-end where we can't effectively optimize it, we lower it all to
* global memory access in NIR.
*
* Once this pass is complete, the only real system values left are the two
* argument pointer system values for BTD dispatch: btd_local_arg_addr and
* btd_global_arg_addr.
*/
void
brw_nir_lower_rt_intrinsics(nir_shader *nir,
const struct intel_device_info *devinfo)
{
nir_foreach_function_impl(impl, nir) {
lower_rt_intrinsics_impl(impl, devinfo);
}
}

View file

@@ -0,0 +1,329 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
#include "nir_phi_builder.h"
UNUSED static bool
no_load_scratch_base_ptr_intrinsic(nir_shader *shader)
{
nir_foreach_function_impl(impl, shader) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic == nir_intrinsic_load_scratch_base_ptr)
return false;
}
}
}
return true;
}
/** Insert the appropriate return instruction at the end of the shader */
void
brw_nir_lower_shader_returns(nir_shader *shader)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
/* Reserve scratch space at the start of the shader's per-thread scratch
* space for the return BINDLESS_SHADER_RECORD address and data payload.
* When a shader is called, the calling shader will write the return BSR
* address in this region of the callee's scratch space.
*
* We could also put it at the end of the caller's scratch space. However,
* doing it this way means that a shader never accesses its caller's scratch
* space unless given an explicit pointer (such as for ray payloads). It
* also makes computing the address easier given that we want to apply an
* alignment to the scratch offset to ensure we can make alignment
* assumptions in the called shader.
*
* This isn't needed for ray-gen shaders because they end the thread and
* never return to the calling trampoline shader.
*/
assert(no_load_scratch_base_ptr_intrinsic(shader));
if (shader->info.stage != MESA_SHADER_RAYGEN)
shader->scratch_size += BRW_BTD_STACK_CALLEE_DATA_SIZE;
nir_builder b = nir_builder_create(impl);
set_foreach(impl->end_block->predecessors, block_entry) {
struct nir_block *block = (void *)block_entry->key;
b.cursor = nir_after_block_before_jump(block);
switch (shader->info.stage) {
case MESA_SHADER_RAYGEN:
/* A raygen shader is always the root of the shader call tree. When
* it ends, we retire the bindless stack ID and no further shaders
* will be executed.
*/
assert(impl->end_block->predecessors->entries == 1);
brw_nir_btd_retire(&b);
break;
case MESA_SHADER_ANY_HIT:
/* The default action of an any-hit shader is to accept the ray
* intersection. Any-hit shaders may have more than one exit. Only
* the final "normal" exit will actually need to accept the
* intersection as any others should come from nir_jump_halt
* instructions inserted after ignore_ray_intersection or
* terminate_ray or the like. However, inserting an accept after
* the ignore or terminate is safe because it'll get deleted later.
*/
nir_accept_ray_intersection(&b);
break;
case MESA_SHADER_CALLABLE:
case MESA_SHADER_MISS:
case MESA_SHADER_CLOSEST_HIT:
/* Callable, miss, and closest-hit shaders don't take any special
* action at the end. They simply return back to the previous shader
* in the call stack.
*/
assert(impl->end_block->predecessors->entries == 1);
brw_nir_btd_return(&b);
break;
case MESA_SHADER_INTERSECTION:
/* This will be handled by brw_nir_lower_intersection_shader */
break;
default:
unreachable("Invalid callable shader stage");
}
}
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
}
static void
store_resume_addr(nir_builder *b, nir_intrinsic_instr *call)
{
uint32_t call_idx = nir_intrinsic_call_idx(call);
uint32_t offset = nir_intrinsic_stack_size(call);
/* First thing on the called shader's stack is the resume address
* followed by a pointer to the payload.
*/
nir_def *resume_record_addr =
nir_iadd_imm(b, nir_load_btd_resume_sbt_addr_intel(b),
call_idx * BRW_BTD_RESUME_SBT_STRIDE);
/* By the time we get here, any remaining shader/function memory
* pointers have been lowered to SSA values.
*/
nir_def *payload_addr =
nir_get_shader_call_payload_src(call)->ssa;
brw_nir_rt_store_scratch(b, offset, BRW_BTD_STACK_ALIGN,
nir_vec2(b, resume_record_addr, payload_addr),
0xf /* write_mask */);
nir_btd_stack_push_intel(b, offset);
}
static bool
lower_shader_trace_ray_instr(struct nir_builder *b, nir_instr *instr, void *data)
{
struct brw_bs_prog_key *key = data;
if (instr->type != nir_instr_type_intrinsic)
return false;
/* Leave nir_intrinsic_rt_resume to be lowered by
* brw_nir_lower_rt_intrinsics()
*/
nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr);
if (call->intrinsic != nir_intrinsic_rt_trace_ray)
return false;
b->cursor = nir_instr_remove(instr);
store_resume_addr(b, call);
nir_def *as_addr = call->src[0].ssa;
nir_def *ray_flags = call->src[1].ssa;
/* From the SPIR-V spec:
*
* "Only the 8 least-significant bits of Cull Mask are used by this
* instruction - other bits are ignored.
*
* Only the 4 least-significant bits of SBT Offset and SBT Stride are
* used by this instruction - other bits are ignored.
*
* Only the 16 least-significant bits of Miss Index are used by this
* instruction - other bits are ignored."
*/
nir_def *cull_mask = nir_iand_imm(b, call->src[2].ssa, 0xff);
nir_def *sbt_offset = nir_iand_imm(b, call->src[3].ssa, 0xf);
nir_def *sbt_stride = nir_iand_imm(b, call->src[4].ssa, 0xf);
nir_def *miss_index = nir_iand_imm(b, call->src[5].ssa, 0xffff);
nir_def *ray_orig = call->src[6].ssa;
nir_def *ray_t_min = call->src[7].ssa;
nir_def *ray_dir = call->src[8].ssa;
nir_def *ray_t_max = call->src[9].ssa;
nir_def *root_node_ptr =
brw_nir_rt_acceleration_structure_to_root_node(b, as_addr);
/* The hardware packet requires an address to the first element of the
* hit SBT.
*
* In order to calculate this, we must multiply the "SBT Offset"
* provided to OpTraceRay by the SBT stride provided for the hit SBT in
* the call to vkCmdTraceRay() and add that to the base address of the
* hit SBT. This stride is not to be confused with the "SBT Stride"
* provided to OpTraceRay which is in units of this stride. It's a
* rather terrible overload of the word "stride". The hardware docs
* call the SPIR-V stride value the "shader index multiplier", which is
* a much more sane name.
*/
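/* For example, with a hit SBT whose vkCmdTraceRays stride is 64B, an
 * OpTraceRay "SBT Offset" of 3 selects the record at base + 3 * 64, while
 * the instruction's own "SBT Stride" only scales per-geometry record
 * selection via shader_index_multiplier.
 */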
nir_def *hit_sbt_stride_B =
nir_load_ray_hit_sbt_stride_intel(b);
nir_def *hit_sbt_offset_B =
nir_imul(b, sbt_offset, nir_u2u32(b, hit_sbt_stride_B));
nir_def *hit_sbt_addr =
nir_iadd(b, nir_load_ray_hit_sbt_addr_intel(b),
nir_u2u64(b, hit_sbt_offset_B));
/* The hardware packet takes an address to the miss BSR. */
nir_def *miss_sbt_stride_B =
nir_load_ray_miss_sbt_stride_intel(b);
nir_def *miss_sbt_offset_B =
nir_imul(b, miss_index, nir_u2u32(b, miss_sbt_stride_B));
nir_def *miss_sbt_addr =
nir_iadd(b, nir_load_ray_miss_sbt_addr_intel(b),
nir_u2u64(b, miss_sbt_offset_B));
struct brw_nir_rt_mem_ray_defs ray_defs = {
.root_node_ptr = root_node_ptr,
/* Combine the shader value given to traceRayEXT() with the pipeline
* creation value VkPipelineCreateFlags.
*/
.ray_flags = nir_ior_imm(b, nir_u2u16(b, ray_flags), key->pipeline_ray_flags),
.ray_mask = cull_mask,
.hit_group_sr_base_ptr = hit_sbt_addr,
.hit_group_sr_stride = nir_u2u16(b, hit_sbt_stride_B),
.miss_sr_ptr = miss_sbt_addr,
.orig = ray_orig,
.t_near = ray_t_min,
.dir = ray_dir,
.t_far = ray_t_max,
.shader_index_multiplier = sbt_stride,
/* The instance leaf pointer is unused in the top level BVH traversal
* since we always start from the root node. We can reuse that field to
* store the ray_flags handed to traceRayEXT(). This will be reloaded
* when the shader accesses gl_IncomingRayFlagsEXT (see
* nir_intrinsic_load_ray_flags brw_nir_lower_rt_intrinsic.c)
*/
.inst_leaf_ptr = nir_u2u64(b, ray_flags),
};
brw_nir_rt_store_mem_ray(b, &ray_defs, BRW_RT_BVH_LEVEL_WORLD);
nir_trace_ray_intel(b,
nir_load_btd_global_arg_addr_intel(b),
nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD),
nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
.synchronous = false);
return true;
}
static bool
lower_shader_call_instr(struct nir_builder *b, nir_intrinsic_instr *call,
void *data)
{
if (call->intrinsic != nir_intrinsic_rt_execute_callable)
return false;
b->cursor = nir_instr_remove(&call->instr);
store_resume_addr(b, call);
nir_def *sbt_offset32 =
nir_imul(b, call->src[0].ssa,
nir_u2u32(b, nir_load_callable_sbt_stride_intel(b)));
nir_def *sbt_addr =
nir_iadd(b, nir_load_callable_sbt_addr_intel(b),
nir_u2u64(b, sbt_offset32));
brw_nir_btd_spawn(b, sbt_addr);
return true;
}
bool
brw_nir_lower_shader_calls(nir_shader *shader, struct brw_bs_prog_key *key)
{
bool a = nir_shader_instructions_pass(shader,
lower_shader_trace_ray_instr,
nir_metadata_none,
key);
bool b = nir_shader_intrinsics_pass(shader, lower_shader_call_instr,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
return a || b;
}
/** Creates a trivial return shader
*
* In most cases this shader doesn't actually do anything. It just needs to
* return to the caller.
*
* By default, our HW can handle the case where a shader is not available and
* will execute the next shader in the tracing call sequence. For instance, a
* RAYGEN shader traces a ray, the tracing generates a hit, but there is no
* ANYHIT shader available. The HW should follow up by executing the
* CLOSESTHIT shader.
*
* This default behavior can be changed through the RT_CTRL register
* (privileged access) and when NULL shader checks are disabled, the HW will
* instead call the call stack handler (this shader). This is what i915 is
* doing as part of Wa_14013202645.
*
* In order to ensure the call to the CLOSESTHIT shader, this shader needs to
* commit the ray and not proceed with the BTD return. Similarly, when the
* same thing happens with the INTERSECTION shader, we should just carry on
* the ray traversal with the continue operation.
*
*/
nir_shader *
brw_nir_create_trivial_return_shader(const struct brw_compiler *compiler,
void *mem_ctx)
{
const nir_shader_compiler_options *nir_options =
compiler->nir_options[MESA_SHADER_CALLABLE];
nir_builder _b = nir_builder_init_simple_shader(MESA_SHADER_CALLABLE,
nir_options,
"RT Trivial Return");
nir_builder *b = &_b;
ralloc_steal(mem_ctx, b->shader);
nir_shader *nir = b->shader;
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
return nir;
}

View file

@ -0,0 +1,765 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "isl/isl.h"
#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_format_convert.h"
static nir_def *
_load_image_param(nir_builder *b, nir_deref_instr *deref, unsigned offset)
{
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(b->shader,
nir_intrinsic_image_deref_load_param_intel);
load->src[0] = nir_src_for_ssa(&deref->def);
nir_intrinsic_set_base(load, offset / 4);
switch (offset) {
case ISL_IMAGE_PARAM_OFFSET_OFFSET:
case ISL_IMAGE_PARAM_SWIZZLING_OFFSET:
load->num_components = 2;
break;
case ISL_IMAGE_PARAM_TILING_OFFSET:
case ISL_IMAGE_PARAM_SIZE_OFFSET:
load->num_components = 3;
break;
case ISL_IMAGE_PARAM_STRIDE_OFFSET:
load->num_components = 4;
break;
default:
unreachable("Invalid param offset");
}
nir_def_init(&load->instr, &load->def, load->num_components, 32);
nir_builder_instr_insert(b, &load->instr);
return &load->def;
}
#define load_image_param(b, d, o) \
_load_image_param(b, d, ISL_IMAGE_PARAM_##o##_OFFSET)
static nir_def *
image_coord_is_in_bounds(nir_builder *b, nir_deref_instr *deref,
nir_def *coord)
{
nir_def *size = load_image_param(b, deref, SIZE);
nir_def *cmp = nir_ilt(b, coord, size);
unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type);
nir_def *in_bounds = nir_imm_true(b);
for (unsigned i = 0; i < coord_comps; i++)
in_bounds = nir_iand(b, in_bounds, nir_channel(b, cmp, i));
return in_bounds;
}
/** Calculate the offset in memory of the texel given by \p coord.
*
* This is meant to be used with untyped surface messages to access a tiled
* surface, which involves manually taking into account the tiling and
* swizzling modes of the surface, so it will hopefully not happen very often.
*
* The tiling algorithm implemented here matches either the X or Y tiling
* layouts supported by the hardware depending on the tiling coefficients
* passed to the program as uniforms. See Volume 1 Part 2 Section 4.5
* "Address Tiling Function" of the IVB PRM for an in-depth explanation of
* the hardware tiling format.
*/
static nir_def *
image_address(nir_builder *b, const struct intel_device_info *devinfo,
nir_deref_instr *deref, nir_def *coord)
{
if (glsl_get_sampler_dim(deref->type) == GLSL_SAMPLER_DIM_1D &&
glsl_sampler_type_is_array(deref->type)) {
/* It's easier if 1D arrays are treated like 2D arrays */
coord = nir_vec3(b, nir_channel(b, coord, 0),
nir_imm_int(b, 0),
nir_channel(b, coord, 1));
} else {
unsigned dims = glsl_get_sampler_coordinate_components(deref->type);
coord = nir_trim_vector(b, coord, dims);
}
nir_def *offset = load_image_param(b, deref, OFFSET);
nir_def *tiling = load_image_param(b, deref, TILING);
nir_def *stride = load_image_param(b, deref, STRIDE);
/* Shift the coordinates by the fixed surface offset. It may be non-zero
* if the image is a single slice of a higher-dimensional surface, or if a
* non-zero mipmap level of the surface is bound to the pipeline. The
* offset needs to be applied here rather than at surface state set-up time
* because the desired slice-level may start mid-tile, so simply shifting
* the surface base address wouldn't give a well-formed tiled surface in
* the general case.
*/
nir_def *xypos = (coord->num_components == 1) ?
nir_vec2(b, coord, nir_imm_int(b, 0)) :
nir_trim_vector(b, coord, 2);
xypos = nir_iadd(b, xypos, offset);
/* The layout of 3-D textures in memory is sort-of like a tiling
* format. At each miplevel, the slices are arranged in rows of
* 2^level slices per row. The slice row is stored in tmp.y and
* the slice within the row is stored in tmp.x.
*
* The layout of 2-D array textures and cubemaps is much simpler:
* Depending on whether the ARYSPC_LOD0 layout is in use it will be
* stored in memory as an array of slices, each one being a 2-D
* arrangement of miplevels, or as a 2D arrangement of miplevels,
* each one being an array of slices. In either case the separation
* between slices of the same LOD is equal to the qpitch value
* provided as stride.w.
*
* This code can be made to handle both 2-D arrays and 3-D textures
* by passing in the miplevel as tile.z for 3-D textures and 0 in
* tile.z for 2-D array textures.
*
* See Volume 1 Part 1 of the Gfx7 PRM, sections 6.18.4.7 "Surface
* Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
* of the hardware 3D texture and 2D array layouts.
*/
if (coord->num_components > 2) {
/* Decompose z into a major (tmp.y) and a minor (tmp.x)
* index.
*/
nir_def *z = nir_channel(b, coord, 2);
nir_def *z_x = nir_ubfe(b, z, nir_imm_int(b, 0),
nir_channel(b, tiling, 2));
nir_def *z_y = nir_ushr(b, z, nir_channel(b, tiling, 2));
/* Take into account the horizontal (tmp.x) and vertical (tmp.y)
* slice offset.
*/
xypos = nir_iadd(b, xypos, nir_imul(b, nir_vec2(b, z_x, z_y),
nir_channels(b, stride, 0xc)));
}
nir_def *addr;
if (coord->num_components > 1) {
/* Calculate the major/minor x and y indices. In order to
* accommodate both X and Y tiling, the Y-major tiling format is
* treated as being a bunch of narrow X-tiles placed next to each
* other. This means that the tile width for Y-tiling is actually
* the width of one sub-column of the Y-major tile where each 4K
* tile has 8 512B sub-columns.
*
* The major Y value is the row of tiles in which the pixel lives.
* The major X value is the tile sub-column in which the pixel
* lives; for X tiling, this is the same as the tile column, for Y
* tiling, each tile has 8 sub-columns. The minor X and Y indices
* are the position within the sub-column.
*/
/* Calculate the minor x and y indices. */
nir_def *minor = nir_ubfe(b, xypos, nir_imm_int(b, 0),
nir_trim_vector(b, tiling, 2));
nir_def *major = nir_ushr(b, xypos, nir_trim_vector(b, tiling, 2));
/* Calculate the texel index from the start of the tile row and the
* vertical coordinate of the row.
* Equivalent to:
* tmp.x = (major.x << tile.y << tile.x) +
* (minor.y << tile.x) + minor.x
* tmp.y = major.y << tile.y
*/
nir_def *idx_x, *idx_y;
idx_x = nir_ishl(b, nir_channel(b, major, 0), nir_channel(b, tiling, 1));
idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 1));
idx_x = nir_ishl(b, idx_x, nir_channel(b, tiling, 0));
idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 0));
idx_y = nir_ishl(b, nir_channel(b, major, 1), nir_channel(b, tiling, 1));
/* Add it to the start of the tile row. */
nir_def *idx;
idx = nir_imul(b, idx_y, nir_channel(b, stride, 1));
idx = nir_iadd(b, idx, idx_x);
/* Multiply by the Bpp value. */
addr = nir_imul(b, idx, nir_channel(b, stride, 0));
if (devinfo->ver < 8 && devinfo->platform != INTEL_PLATFORM_BYT) {
/* Take into account the two dynamically specified shifts. Both are
* used to implement swizzling of X-tiled surfaces. For Y-tiled
* surfaces only one bit needs to be XOR-ed with bit 6 of the memory
* address, so a swz value of 0xff (actually interpreted as 31 by the
* hardware) will be provided to cause the relevant bit of tmp.y to
* be zero and turn the first XOR into the identity. For linear
* surfaces or platforms lacking address swizzling, both shifts will
* be 0xff, causing the relevant bits of both tmp.x and tmp.y to be
* zero, which effectively disables swizzling.
*/
nir_def *swizzle = load_image_param(b, deref, SWIZZLING);
nir_def *shift0 = nir_ushr(b, addr, nir_channel(b, swizzle, 0));
nir_def *shift1 = nir_ushr(b, addr, nir_channel(b, swizzle, 1));
/* XOR tmp.x and tmp.y with bit 6 of the memory address. */
nir_def *bit = nir_iand(b, nir_ixor(b, shift0, shift1),
nir_imm_int(b, 1 << 6));
addr = nir_ixor(b, addr, bit);
}
} else {
/* Multiply by the Bpp/stride value. Note that the addr.y may be
* non-zero even if the image is one-dimensional because a vertical
* offset may have been applied above to select a non-zero slice or
* level of a higher-dimensional texture.
*/
nir_def *idx;
idx = nir_imul(b, nir_channel(b, xypos, 1), nir_channel(b, stride, 1));
idx = nir_iadd(b, nir_channel(b, xypos, 0), idx);
addr = nir_imul(b, idx, nir_channel(b, stride, 0));
}
return addr;
}
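/* Worked example of the index math above (purely illustrative, assuming
 * hypothetical 8x8-texel tiles, i.e. tiling = (3, 3, 0)): for xypos = (13, 10),
 *
 *    minor = (13 & 7, 10 & 7) = (5, 2)
 *    major = (13 >> 3, 10 >> 3) = (1, 1)
 *    idx_x = (1 << 3 << 3) + (2 << 3) + 5 = 85
 *    idx_y = 1 << 3 = 8
 *    addr  = (8 * stride.y + 85) * Bpp
 *
 * which matches the tmp.x/tmp.y equations quoted in the comment inside the
 * function.
 */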
struct format_info {
const struct isl_format_layout *fmtl;
unsigned chans;
unsigned bits[4];
};
static struct format_info
get_format_info(enum isl_format fmt)
{
const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
return (struct format_info) {
.fmtl = fmtl,
.chans = isl_format_get_num_channels(fmt),
.bits = {
fmtl->channels.r.bits,
fmtl->channels.g.bits,
fmtl->channels.b.bits,
fmtl->channels.a.bits
},
};
}
static nir_def *
convert_color_for_load(nir_builder *b, const struct intel_device_info *devinfo,
nir_def *color,
enum isl_format image_fmt, enum isl_format lower_fmt,
unsigned dest_components)
{
if (image_fmt == lower_fmt)
goto expand_vec;
if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
assert(lower_fmt == ISL_FORMAT_R32_UINT);
color = nir_format_unpack_11f11f10f(b, color);
goto expand_vec;
}
struct format_info image = get_format_info(image_fmt);
struct format_info lower = get_format_info(lower_fmt);
const bool needs_sign_extension =
isl_format_has_snorm_channel(image_fmt) ||
isl_format_has_sint_channel(image_fmt);
/* We only check the red channel to detect if we need to pack/unpack */
assert(image.bits[0] != lower.bits[0] ||
memcmp(image.bits, lower.bits, sizeof(image.bits)) == 0);
if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
if (needs_sign_extension)
color = nir_format_unpack_sint(b, color, image.bits, image.chans);
else
color = nir_format_unpack_uint(b, color, image.bits, image.chans);
} else {
/* All these formats are homogeneous */
for (unsigned i = 1; i < image.chans; i++)
assert(image.bits[i] == image.bits[0]);
/* On IVB, we rely on the undocumented behavior that typed reads from
* surfaces of the unsupported R8 and R16 formats return useful data in
* their least significant bits. However, the data in the high bits is
* garbage so we have to discard it.
*/
if (devinfo->verx10 == 70 &&
(lower_fmt == ISL_FORMAT_R16_UINT ||
lower_fmt == ISL_FORMAT_R8_UINT))
color = nir_format_mask_uvec(b, color, lower.bits);
if (image.bits[0] != lower.bits[0]) {
color = nir_format_bitcast_uvec_unmasked(b, color, lower.bits[0],
image.bits[0]);
}
if (needs_sign_extension)
color = nir_format_sign_extend_ivec(b, color, image.bits);
}
switch (image.fmtl->channels.r.type) {
case ISL_UNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_unorm_to_float(b, color, image.bits);
break;
case ISL_SNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_snorm_to_float(b, color, image.bits);
break;
case ISL_SFLOAT:
if (image.bits[0] == 16)
color = nir_unpack_half_2x16_split_x(b, color);
break;
case ISL_UINT:
case ISL_SINT:
break;
default:
unreachable("Invalid image channel type");
}
expand_vec:
assert(dest_components == 1 || dest_components == 4);
assert(color->num_components <= dest_components);
if (color->num_components == dest_components)
return color;
nir_def *comps[4];
for (unsigned i = 0; i < color->num_components; i++)
comps[i] = nir_channel(b, color, i);
for (unsigned i = color->num_components; i < 3; i++)
comps[i] = nir_imm_int(b, 0);
if (color->num_components < 4) {
if (isl_format_has_int_channel(image_fmt))
comps[3] = nir_imm_int(b, 1);
else
comps[3] = nir_imm_float(b, 1);
}
return nir_vec(b, comps, dest_components);
}
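/* Example flow through convert_color_for_load() (assuming RGBA8_UNORM is
 * lowered to R32_UINT, which is what isl_lower_storage_image_format() does
 * on hardware without native support): the raw 32-bit dword is split into
 * four 8-bit channels by nir_format_unpack_uint(), converted to floats by
 * nir_format_unorm_to_float(), and finally expanded to a vec4 (with an
 * implicit alpha of 1.0 if the format had fewer than four components).
 */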
static bool
lower_image_load_instr(nir_builder *b,
const struct intel_device_info *devinfo,
nir_intrinsic_instr *intrin,
bool sparse)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
if (var->data.image.format == PIPE_FORMAT_NONE)
return false;
const enum isl_format image_fmt =
isl_format_for_pipe_format(var->data.image.format);
if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
const enum isl_format lower_fmt =
isl_lower_storage_image_format(devinfo, image_fmt);
const unsigned dest_components =
sparse ? (intrin->num_components - 1) : intrin->num_components;
/* Use an undef to hold the uses of the load while we do the color
* conversion.
*/
nir_def *placeholder = nir_undef(b, 4, 32);
nir_def_rewrite_uses(&intrin->def, placeholder);
intrin->num_components = isl_format_get_num_channels(lower_fmt);
intrin->def.num_components = intrin->num_components;
b->cursor = nir_after_instr(&intrin->instr);
nir_def *color = convert_color_for_load(b, devinfo,
&intrin->def,
image_fmt, lower_fmt,
dest_components);
if (sparse) {
/* Put the sparse component back on the original instruction */
intrin->num_components++;
intrin->def.num_components = intrin->num_components;
/* Carry over the sparse component without modifying it with the
* converted color.
*/
nir_def *sparse_color[NIR_MAX_VEC_COMPONENTS];
for (unsigned i = 0; i < dest_components; i++)
sparse_color[i] = nir_channel(b, color, i);
sparse_color[dest_components] =
nir_channel(b, &intrin->def, intrin->num_components - 1);
color = nir_vec(b, sparse_color, dest_components + 1);
}
nir_def_rewrite_uses(placeholder, color);
nir_instr_remove(placeholder->parent_instr);
} else {
/* This code path is only useful prior to Gfx9; we have no plans to
* enable sparse there.
*/
assert(!sparse);
const struct isl_format_layout *image_fmtl =
isl_format_get_layout(image_fmt);
/* We have a matching typed format for everything 32b and below */
assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
ISL_FORMAT_R32G32_UINT :
ISL_FORMAT_R32G32B32A32_UINT;
const unsigned dest_components = intrin->num_components;
b->cursor = nir_instr_remove(&intrin->instr);
nir_def *coord = intrin->src[1].ssa;
nir_def *do_load = image_coord_is_in_bounds(b, deref, coord);
if (devinfo->verx10 == 70) {
/* Check whether the first stride component (i.e. the Bpp value)
* is greater than four, which on Gfx7 indicates that a surface of
* type RAW has been bound for untyped access. Reading or writing
* to a surface of type other than RAW using untyped surface
* messages causes a hang on IVB and VLV.
*/
nir_def *stride = load_image_param(b, deref, STRIDE);
nir_def *is_raw =
nir_igt_imm(b, nir_channel(b, stride, 0), 4);
do_load = nir_iand(b, do_load, is_raw);
}
nir_push_if(b, do_load);
nir_def *addr = image_address(b, devinfo, deref, coord);
nir_def *load =
nir_image_deref_load_raw_intel(b, image_fmtl->bpb / 32, 32,
&deref->def, addr);
nir_push_else(b, NULL);
nir_def *zero = nir_imm_zero(b, load->num_components, 32);
nir_pop_if(b, NULL);
nir_def *value = nir_if_phi(b, load, zero);
nir_def *color = convert_color_for_load(b, devinfo, value,
image_fmt, raw_fmt,
dest_components);
nir_def_rewrite_uses(&intrin->def, color);
}
return true;
}
static nir_def *
convert_color_for_store(nir_builder *b, const struct intel_device_info *devinfo,
nir_def *color,
enum isl_format image_fmt, enum isl_format lower_fmt)
{
struct format_info image = get_format_info(image_fmt);
struct format_info lower = get_format_info(lower_fmt);
color = nir_trim_vector(b, color, image.chans);
if (image_fmt == lower_fmt)
return color;
if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
assert(lower_fmt == ISL_FORMAT_R32_UINT);
return nir_format_pack_11f11f10f(b, color);
}
switch (image.fmtl->channels.r.type) {
case ISL_UNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_float_to_unorm(b, color, image.bits);
break;
case ISL_SNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_float_to_snorm(b, color, image.bits);
break;
case ISL_SFLOAT:
if (image.bits[0] == 16)
color = nir_format_float_to_half(b, color);
break;
case ISL_UINT:
color = nir_format_clamp_uint(b, color, image.bits);
break;
case ISL_SINT:
color = nir_format_clamp_sint(b, color, image.bits);
break;
default:
unreachable("Invalid image channel type");
}
if (image.bits[0] < 32 &&
(isl_format_has_snorm_channel(image_fmt) ||
isl_format_has_sint_channel(image_fmt)))
color = nir_format_mask_uvec(b, color, image.bits);
if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
color = nir_format_pack_uint(b, color, image.bits, image.chans);
} else {
/* All these formats are homogeneous */
for (unsigned i = 1; i < image.chans; i++)
assert(image.bits[i] == image.bits[0]);
if (image.bits[0] != lower.bits[0]) {
color = nir_format_bitcast_uvec_unmasked(b, color, image.bits[0],
lower.bits[0]);
}
}
return color;
}
static bool
lower_image_store_instr(nir_builder *b,
const struct intel_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
/* For write-only surfaces, we trust that the hardware can just do the
* conversion for us.
*/
if (var->data.access & ACCESS_NON_READABLE)
return false;
if (var->data.image.format == PIPE_FORMAT_NONE)
return false;
const enum isl_format image_fmt =
isl_format_for_pipe_format(var->data.image.format);
if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
const enum isl_format lower_fmt =
isl_lower_storage_image_format(devinfo, image_fmt);
/* Color conversion goes before the store */
b->cursor = nir_before_instr(&intrin->instr);
nir_def *color = convert_color_for_store(b, devinfo,
intrin->src[3].ssa,
image_fmt, lower_fmt);
intrin->num_components = isl_format_get_num_channels(lower_fmt);
nir_src_rewrite(&intrin->src[3], color);
} else {
const struct isl_format_layout *image_fmtl =
isl_format_get_layout(image_fmt);
/* We have a matching typed format for everything 32b and below */
assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
ISL_FORMAT_R32G32_UINT :
ISL_FORMAT_R32G32B32A32_UINT;
b->cursor = nir_instr_remove(&intrin->instr);
nir_def *coord = intrin->src[1].ssa;
nir_def *do_store = image_coord_is_in_bounds(b, deref, coord);
if (devinfo->verx10 == 70) {
/* Check whether the first stride component (i.e. the Bpp value)
* is greater than four, which on Gfx7 indicates that a surface of
* type RAW has been bound for untyped access. Reading or writing
* to a surface of type other than RAW using untyped surface
* messages causes a hang on IVB and VLV.
*/
nir_def *stride = load_image_param(b, deref, STRIDE);
nir_def *is_raw =
nir_igt_imm(b, nir_channel(b, stride, 0), 4);
do_store = nir_iand(b, do_store, is_raw);
}
nir_push_if(b, do_store);
nir_def *addr = image_address(b, devinfo, deref, coord);
nir_def *color = convert_color_for_store(b, devinfo,
intrin->src[3].ssa,
image_fmt, raw_fmt);
nir_intrinsic_instr *store =
nir_intrinsic_instr_create(b->shader,
nir_intrinsic_image_deref_store_raw_intel);
store->src[0] = nir_src_for_ssa(&deref->def);
store->src[1] = nir_src_for_ssa(addr);
store->src[2] = nir_src_for_ssa(color);
store->num_components = image_fmtl->bpb / 32;
nir_builder_instr_insert(b, &store->instr);
nir_pop_if(b, NULL);
}
return true;
}
static bool
lower_image_atomic_instr(nir_builder *b,
const struct intel_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
if (devinfo->verx10 >= 75)
return false;
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
b->cursor = nir_instr_remove(&intrin->instr);
/* Use an undef to hold the uses of the atomic result while we guard it with
* the image-is-bound check below.
*/
nir_def *placeholder = nir_undef(b, 4, 32);
nir_def_rewrite_uses(&intrin->def, placeholder);
/* Check the first component of the size field to find out if the
* image is bound. Necessary on IVB for typed atomics because
* they don't seem to respect null surfaces and will happily
* corrupt or read random memory when no image is bound.
*/
nir_def *size = load_image_param(b, deref, SIZE);
nir_def *zero = nir_imm_int(b, 0);
nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), zero));
nir_builder_instr_insert(b, &intrin->instr);
nir_pop_if(b, NULL);
nir_def *result = nir_if_phi(b, &intrin->def, zero);
nir_def_rewrite_uses(placeholder, result);
return true;
}
static bool
lower_image_size_instr(nir_builder *b,
const struct intel_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
/* For write-only images, we have an actual image surface so we fall back
* and let the back-end emit a TXS for this.
*/
if (var->data.access & ACCESS_NON_READABLE)
return false;
if (var->data.image.format == PIPE_FORMAT_NONE)
return false;
/* If we have a matching typed format, then we have an actual image surface
* so we fall back and let the back-end emit a TXS for this.
*/
const enum isl_format image_fmt =
isl_format_for_pipe_format(var->data.image.format);
if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt))
return false;
assert(nir_src_as_uint(intrin->src[1]) == 0);
b->cursor = nir_instr_remove(&intrin->instr);
nir_def *size = load_image_param(b, deref, SIZE);
nir_def *comps[4] = { NULL, NULL, NULL, NULL };
assert(nir_intrinsic_image_dim(intrin) != GLSL_SAMPLER_DIM_CUBE);
unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type);
for (unsigned c = 0; c < coord_comps; c++)
comps[c] = nir_channel(b, size, c);
for (unsigned c = coord_comps; c < intrin->def.num_components; ++c)
comps[c] = nir_imm_int(b, 1);
nir_def *vec = nir_vec(b, comps, intrin->def.num_components);
nir_def_rewrite_uses(&intrin->def, vec);
return true;
}
static bool
brw_nir_lower_storage_image_instr(nir_builder *b,
nir_instr *instr,
void *cb_data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
const struct brw_nir_lower_storage_image_opts *opts = cb_data;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_image_deref_load:
if (opts->lower_loads)
return lower_image_load_instr(b, opts->devinfo, intrin, false);
return false;
case nir_intrinsic_image_deref_sparse_load:
if (opts->lower_loads)
return lower_image_load_instr(b, opts->devinfo, intrin, true);
return false;
case nir_intrinsic_image_deref_store:
if (opts->lower_stores)
return lower_image_store_instr(b, opts->devinfo, intrin);
return false;
case nir_intrinsic_image_deref_atomic:
case nir_intrinsic_image_deref_atomic_swap:
if (opts->lower_atomics)
return lower_image_atomic_instr(b, opts->devinfo, intrin);
return false;
case nir_intrinsic_image_deref_size:
if (opts->lower_get_size)
return lower_image_size_instr(b, opts->devinfo, intrin);
return false;
default:
/* Nothing to do */
return false;
}
}
bool
brw_nir_lower_storage_image(nir_shader *shader,
const struct brw_nir_lower_storage_image_opts *opts)
{
bool progress = false;
const nir_lower_image_options image_options = {
.lower_cube_size = true,
.lower_image_samples_to_one = true,
};
progress |= nir_lower_image(shader, &image_options);
progress |= nir_shader_instructions_pass(shader,
brw_nir_lower_storage_image_instr,
nir_metadata_none,
(void *)opts);
return progress;
}
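/* Illustrative usage from a hypothetical driver, lowering everything except
 * size queries (which the back-end can handle with a TXS):
 *
 *    const struct brw_nir_lower_storage_image_opts opts = {
 *       .devinfo = devinfo,
 *       .lower_loads = true,
 *       .lower_stores = true,
 *       .lower_atomics = true,
 *       .lower_get_size = false,
 *    };
 *    NIR_PASS_V(nir, brw_nir_lower_storage_image, &opts);
 */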

View file

@ -0,0 +1,536 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "intel_nir.h"
#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"
#include "intel_nir.h"
static bool
resize_deref(nir_builder *b, nir_deref_instr *deref,
unsigned num_components, unsigned bit_size)
{
if (deref->def.num_components == num_components &&
deref->def.bit_size == bit_size)
return false;
/* NIR requires array indices to match the deref bit size */
if (deref->def.bit_size != bit_size &&
(deref->deref_type == nir_deref_type_array ||
deref->deref_type == nir_deref_type_ptr_as_array)) {
b->cursor = nir_before_instr(&deref->instr);
nir_def *idx;
if (nir_src_is_const(deref->arr.index)) {
idx = nir_imm_intN_t(b, nir_src_as_int(deref->arr.index), bit_size);
} else {
idx = nir_i2iN(b, deref->arr.index.ssa, bit_size);
}
nir_src_rewrite(&deref->arr.index, idx);
}
deref->def.num_components = num_components;
deref->def.bit_size = bit_size;
return true;
}
static bool
lower_rt_io_derefs(nir_shader *shader)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
bool progress = false;
unsigned num_shader_call_vars = 0;
nir_foreach_variable_with_modes(var, shader, nir_var_shader_call_data)
num_shader_call_vars++;
unsigned num_ray_hit_attrib_vars = 0;
nir_foreach_variable_with_modes(var, shader, nir_var_ray_hit_attrib)
num_ray_hit_attrib_vars++;
/* At most one payload is allowed because it's an input. Technically, this
* is also true for hit attribute variables. However, after we inline an
* any-hit shader into an intersection shader, we can end up with multiple
* hit attribute variables. They'll end up mapping to a cast from the same
* base pointer so this is fine.
*/
assert(num_shader_call_vars <= 1);
nir_builder b = nir_builder_at(nir_before_impl(impl));
nir_def *call_data_addr = NULL;
if (num_shader_call_vars > 0) {
assert(shader->scratch_size >= BRW_BTD_STACK_CALLEE_DATA_SIZE);
call_data_addr =
brw_nir_rt_load_scratch(&b, BRW_BTD_STACK_CALL_DATA_PTR_OFFSET, 8,
1, 64);
progress = true;
}
gl_shader_stage stage = shader->info.stage;
nir_def *hit_attrib_addr = NULL;
if (num_ray_hit_attrib_vars > 0) {
assert(stage == MESA_SHADER_ANY_HIT ||
stage == MESA_SHADER_CLOSEST_HIT ||
stage == MESA_SHADER_INTERSECTION);
nir_def *hit_addr =
brw_nir_rt_mem_hit_addr(&b, stage == MESA_SHADER_CLOSEST_HIT);
/* The vec2 barycentrics are in the 2nd and 3rd dwords of MemHit */
nir_def *bary_addr = nir_iadd_imm(&b, hit_addr, 4);
hit_attrib_addr = nir_bcsel(&b, nir_load_leaf_procedural_intel(&b),
brw_nir_rt_hit_attrib_data_addr(&b),
bary_addr);
progress = true;
}
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_deref)
continue;
nir_deref_instr *deref = nir_instr_as_deref(instr);
if (nir_deref_mode_is(deref, nir_var_shader_call_data)) {
deref->modes = nir_var_function_temp;
if (deref->deref_type == nir_deref_type_var) {
b.cursor = nir_before_instr(&deref->instr);
nir_deref_instr *cast =
nir_build_deref_cast(&b, call_data_addr,
nir_var_function_temp,
deref->var->type, 0);
nir_def_rewrite_uses(&deref->def,
&cast->def);
nir_instr_remove(&deref->instr);
progress = true;
}
} else if (nir_deref_mode_is(deref, nir_var_ray_hit_attrib)) {
deref->modes = nir_var_function_temp;
if (deref->deref_type == nir_deref_type_var) {
b.cursor = nir_before_instr(&deref->instr);
nir_deref_instr *cast =
nir_build_deref_cast(&b, hit_attrib_addr,
nir_var_function_temp,
deref->type, 0);
nir_def_rewrite_uses(&deref->def,
&cast->def);
nir_instr_remove(&deref->instr);
progress = true;
}
}
/* We're going to lower all function_temp memory to scratch using
* 64-bit addresses. We need to resize all our derefs first or else
* nir_lower_explicit_io will have a fit.
*/
if (nir_deref_mode_is(deref, nir_var_function_temp) &&
resize_deref(&b, deref, 1, 64))
progress = true;
}
}
if (progress) {
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
} else {
nir_metadata_preserve(impl, nir_metadata_all);
}
return progress;
}
/** Lowers ray-tracing shader I/O and scratch access
*
* SPV_KHR_ray_tracing adds three new types of I/O, each of which needs its
* own bit of special care:
*
* - Shader payload data: This is represented by the IncomingCallableData
* and IncomingRayPayload storage classes which are both represented by
* nir_var_call_data in NIR. There is at most one of these per-shader and
* they contain payload data passed down the stack from the parent shader
* when it calls executeCallable() or traceRay(). In our implementation,
* the actual storage lives in the calling shader's scratch space and we're
* passed a pointer to it.
*
* - Hit attribute data: This is represented by the HitAttribute storage
* class in SPIR-V and nir_var_ray_hit_attrib in NIR. For triangle
* geometry, it's supposed to contain two floats which are the barycentric
* coordinates. For AABB/procedural geometry, it contains the hit data
* written out by the intersection shader. In our implementation, it's a
* 64-bit pointer which points either to the u/v area of the relevant
* MemHit data structure or the space right after the HW ray stack entry.
*
* - Shader record buffer data: This allows read-only access to the data
* stored in the SBT right after the bindless shader handles. It's
* effectively a UBO with a magic address. Coming out of spirv_to_nir,
* we get a nir_intrinsic_load_shader_record_ptr which is cast to a
* nir_var_mem_global deref and all access happens through that. The
* shader_record_ptr system value is handled in brw_nir_lower_rt_intrinsics
* and we assume nir_lower_explicit_io is called elsewhere thanks to
* VK_KHR_buffer_device_address so there's really nothing to do here.
*
* We also handle lowering any remaining function_temp variables to scratch at
* this point. This gets rid of any remaining arrays and also takes care of
* the sending side of ray payloads where we pass pointers to a function_temp
* variable down the call stack.
*/
static void
lower_rt_io_and_scratch(nir_shader *nir)
{
/* First, we need to ensure all the I/O variables have explicit types. Because
* these are shader-internal and don't come in from outside, they don't
* have an explicit memory layout and we have to assign them one.
*/
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
nir_var_function_temp |
nir_var_shader_call_data |
nir_var_ray_hit_attrib,
glsl_get_natural_size_align_bytes);
/* Now patch any derefs to I/O vars */
NIR_PASS_V(nir, lower_rt_io_derefs);
/* Finally, lower any remaining function_temp, mem_constant, or
* ray_hit_attrib access to 64-bit global memory access.
*/
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_function_temp |
nir_var_mem_constant |
nir_var_ray_hit_attrib,
nir_address_format_64bit_global);
}
static void
build_terminate_ray(nir_builder *b)
{
nir_def *skip_closest_hit = nir_test_mask(b, nir_load_ray_flags(b),
BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER);
nir_push_if(b, skip_closest_hit);
{
/* The shader that calls traceRay() is unable to access any ray hit
* information except for that which is explicitly written into the ray
* payload by shaders invoked during the trace. If there's no closest-
* hit shader, then accepting the hit has no observable effect; it's
* just extra memory traffic for no reason.
*/
brw_nir_btd_return(b);
nir_jump(b, nir_jump_halt);
}
nir_push_else(b, NULL);
{
/* The closest hit shader is in the same shader group as the any-hit
* shader that we're currently in. We can get the address for its SBT
* handle by looking at the shader record pointer and subtracting the
* size of an SBT handle. The BINDLESS_SHADER_RECORD for a closest hit
* shader is the first one in the SBT handle.
*/
nir_def *closest_hit =
nir_iadd_imm(b, nir_load_shader_record_ptr(b),
-BRW_RT_SBT_HANDLE_SIZE);
brw_nir_rt_commit_hit(b);
brw_nir_btd_spawn(b, closest_hit);
nir_jump(b, nir_jump_halt);
}
nir_pop_if(b, NULL);
}
/** Lowers away ray walk intrinsics
*
* This lowers terminate_ray, ignore_ray_intersection, and the NIR-specific
* accept_ray_intersection intrinsics to the appropriate Intel-specific
* intrinsics.
*/
static bool
lower_ray_walk_intrinsics(nir_shader *shader,
const struct intel_device_info *devinfo)
{
assert(shader->info.stage == MESA_SHADER_ANY_HIT ||
shader->info.stage == MESA_SHADER_INTERSECTION);
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_builder b = nir_builder_create(impl);
bool progress = false;
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_ignore_ray_intersection: {
b.cursor = nir_instr_remove(&intrin->instr);
/* We put the newly emitted code inside a dummy if because it's
* going to contain a jump instruction and we don't want to deal
* with that mess here. It'll get dealt with by our control-flow
* optimization passes.
*/
nir_push_if(&b, nir_imm_true(&b));
nir_trace_ray_intel(&b,
nir_load_btd_global_arg_addr_intel(&b),
nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
nir_imm_int(&b, GEN_RT_TRACE_RAY_CONTINUE),
.synchronous = false);
nir_jump(&b, nir_jump_halt);
nir_pop_if(&b, NULL);
progress = true;
break;
}
case nir_intrinsic_accept_ray_intersection: {
b.cursor = nir_instr_remove(&intrin->instr);
nir_def *terminate = nir_test_mask(&b, nir_load_ray_flags(&b),
BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT);
nir_push_if(&b, terminate);
{
build_terminate_ray(&b);
}
nir_push_else(&b, NULL);
{
nir_trace_ray_intel(&b,
nir_load_btd_global_arg_addr_intel(&b),
nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
nir_imm_int(&b, GEN_RT_TRACE_RAY_COMMIT),
.synchronous = false);
nir_jump(&b, nir_jump_halt);
}
nir_pop_if(&b, NULL);
progress = true;
break;
}
case nir_intrinsic_terminate_ray: {
b.cursor = nir_instr_remove(&intrin->instr);
build_terminate_ray(&b);
progress = true;
break;
}
default:
break;
}
}
}
if (progress) {
nir_metadata_preserve(impl, nir_metadata_none);
} else {
nir_metadata_preserve(impl, nir_metadata_all);
}
return progress;
}
void
brw_nir_lower_raygen(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_RAYGEN);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_any_hit(nir_shader *nir, const struct intel_device_info *devinfo)
{
assert(nir->info.stage == MESA_SHADER_ANY_HIT);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
NIR_PASS_V(nir, lower_ray_walk_intrinsics, devinfo);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_closest_hit(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_CLOSEST_HIT);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_miss(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_MISS);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_callable(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_CALLABLE);
NIR_PASS_V(nir, brw_nir_lower_shader_returns);
lower_rt_io_and_scratch(nir);
}
void
brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection,
const nir_shader *any_hit,
const struct intel_device_info *devinfo)
{
assert(intersection->info.stage == MESA_SHADER_INTERSECTION);
assert(any_hit == NULL || any_hit->info.stage == MESA_SHADER_ANY_HIT);
NIR_PASS_V(intersection, brw_nir_lower_shader_returns);
NIR_PASS_V(intersection, brw_nir_lower_intersection_shader,
any_hit, devinfo);
NIR_PASS_V(intersection, lower_ray_walk_intrinsics, devinfo);
lower_rt_io_and_scratch(intersection);
}
static nir_def *
build_load_uniform(nir_builder *b, unsigned offset,
unsigned num_components, unsigned bit_size)
{
return nir_load_uniform(b, num_components, bit_size, nir_imm_int(b, 0),
.base = offset,
.range = num_components * bit_size / 8);
}
#define load_trampoline_param(b, name, num_components, bit_size) \
build_load_uniform((b), offsetof(struct brw_rt_raygen_trampoline_params, name), \
(num_components), (bit_size))
nir_shader *
brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
void *mem_ctx)
{
const struct intel_device_info *devinfo = compiler->devinfo;
const nir_shader_compiler_options *nir_options =
compiler->nir_options[MESA_SHADER_COMPUTE];
STATIC_ASSERT(sizeof(struct brw_rt_raygen_trampoline_params) == 32);
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
nir_options,
"RT Ray-Gen Trampoline");
ralloc_steal(mem_ctx, b.shader);
b.shader->info.workgroup_size_variable = true;
/* The RT global data and raygen BINDLESS_SHADER_RECORD addresses are
* passed in as push constants in the first register. We deal with the
* raygen BSR address here; the global data we'll deal with later.
*/
b.shader->num_uniforms = 32;
nir_def *raygen_param_bsr_addr =
load_trampoline_param(&b, raygen_bsr_addr, 1, 64);
nir_def *is_indirect =
nir_i2b(&b, load_trampoline_param(&b, is_indirect, 1, 8));
nir_def *local_shift =
nir_u2u32(&b, load_trampoline_param(&b, local_group_size_log2, 3, 8));
nir_def *raygen_indirect_bsr_addr;
nir_push_if(&b, is_indirect);
{
raygen_indirect_bsr_addr =
nir_load_global_constant(&b, raygen_param_bsr_addr,
8 /* align */,
1 /* components */,
64 /* bit_size */);
}
nir_pop_if(&b, NULL);
nir_def *raygen_bsr_addr =
nir_if_phi(&b, raygen_indirect_bsr_addr, raygen_param_bsr_addr);
nir_def *global_id = nir_load_workgroup_id_zero_base(&b);
nir_def *simd_channel = nir_load_subgroup_invocation(&b);
nir_def *local_x =
nir_ubfe(&b, simd_channel, nir_imm_int(&b, 0),
nir_channel(&b, local_shift, 0));
nir_def *local_y =
nir_ubfe(&b, simd_channel, nir_channel(&b, local_shift, 0),
nir_channel(&b, local_shift, 1));
nir_def *local_z =
nir_ubfe(&b, simd_channel,
nir_iadd(&b, nir_channel(&b, local_shift, 0),
nir_channel(&b, local_shift, 1)),
nir_channel(&b, local_shift, 2));
nir_def *launch_id =
nir_iadd(&b, nir_ishl(&b, global_id, local_shift),
nir_vec3(&b, local_x, local_y, local_z));
nir_def *launch_size = nir_load_ray_launch_size(&b);
nir_push_if(&b, nir_ball(&b, nir_ult(&b, launch_id, launch_size)));
{
nir_store_global(&b, brw_nir_rt_sw_hotzone_addr(&b, devinfo), 16,
nir_vec4(&b, nir_imm_int(&b, 0), /* Stack ptr */
nir_channel(&b, launch_id, 0),
nir_channel(&b, launch_id, 1),
nir_channel(&b, launch_id, 2)),
0xf /* write mask */);
brw_nir_btd_spawn(&b, raygen_bsr_addr);
}
nir_push_else(&b, NULL);
{
/* Even though these invocations aren't being used for anything, the
* hardware allocated stack IDs for them, so we need to retire them.
*/
brw_nir_btd_retire(&b);
}
nir_pop_if(&b, NULL);
nir_shader *nir = b.shader;
nir->info.name = ralloc_strdup(nir, "RT: TraceRay trampoline");
nir_validate_shader(nir, "in brw_nir_create_raygen_trampoline");
struct brw_nir_compiler_opts opts = {};
brw_preprocess_nir(compiler, nir, &opts);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
b = nir_builder_create(nir_shader_get_entrypoint(b.shader));
/* brw_nir_lower_rt_intrinsics will leave us with a btd_global_arg_addr
* intrinsic which doesn't exist in compute shaders. We also created one
* above when we generated the BTD spawn intrinsic. Now we go through and
* replace them with a uniform load.
*/
nir_foreach_block(block, b.impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_btd_global_arg_addr_intel)
continue;
b.cursor = nir_before_instr(&intrin->instr);
nir_def *global_arg_addr =
load_trampoline_param(&b, rt_disp_globals_addr, 1, 64);
nir_def_rewrite_uses(&intrin->def,
global_arg_addr);
nir_instr_remove(instr);
}
}
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
const bool is_scalar = true;
brw_nir_optimize(nir, is_scalar, devinfo);
return nir;
}
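/* Example of the launch-ID decomposition above (illustrative values): with
 * local_group_size_log2 = (3, 1, 0), each workgroup covers an 8x2x1 block,
 * i.e. one SIMD16 thread. For simd_channel = 13:
 *
 *    local_x = bfe(13, 0, 3) = 5
 *    local_y = bfe(13, 3, 1) = 1
 *    local_z = bfe(13, 4, 0) = 0
 *
 * so launch_id = (global_id << (3, 1, 0)) + (5, 1, 0).
 */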

View file

@ -0,0 +1,76 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_NIR_RT_H
#define BRW_NIR_RT_H
#include "brw_nir.h"
#include "brw_rt.h"
#ifdef __cplusplus
extern "C" {
#endif
void brw_nir_lower_raygen(nir_shader *nir);
void brw_nir_lower_any_hit(nir_shader *nir,
const struct intel_device_info *devinfo);
void brw_nir_lower_closest_hit(nir_shader *nir);
void brw_nir_lower_miss(nir_shader *nir);
void brw_nir_lower_callable(nir_shader *nir);
void brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection,
const nir_shader *any_hit,
const struct intel_device_info *devinfo);
/* We reserve the first 16B of the stack for callee data pointers */
#define BRW_BTD_STACK_RESUME_BSR_ADDR_OFFSET 0
#define BRW_BTD_STACK_CALL_DATA_PTR_OFFSET 8
#define BRW_BTD_STACK_CALLEE_DATA_SIZE 16
/* We require the stack to be 8B aligned at the start of a shader */
#define BRW_BTD_STACK_ALIGN 8
bool brw_nir_lower_ray_queries(nir_shader *shader,
const struct intel_device_info *devinfo);
void brw_nir_lower_shader_returns(nir_shader *shader);
bool brw_nir_lower_shader_calls(nir_shader *shader, struct brw_bs_prog_key *key);
void brw_nir_lower_rt_intrinsics(nir_shader *shader,
const struct intel_device_info *devinfo);
void brw_nir_lower_intersection_shader(nir_shader *intersection,
const nir_shader *any_hit,
const struct intel_device_info *devinfo);
nir_shader *
brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
void *mem_ctx);
nir_shader *
brw_nir_create_trivial_return_shader(const struct brw_compiler *compiler,
void *mem_ctx);
#ifdef __cplusplus
}
#endif
#endif /* BRW_NIR_RT_H */

View file

@ -0,0 +1,990 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_NIR_RT_BUILDER_H
#define BRW_NIR_RT_BUILDER_H
/* This file provides helpers to access memory based data structures that the
* RT hardware reads/writes and their locations.
*
* See also "Memory Based Data Structures for Ray Tracing" (BSpec 47547) and
* "Ray Tracing Address Computation for Memory Resident Structures" (BSpec
* 47550).
*/
#include "brw_rt.h"
#include "nir_builder.h"
#define is_access_for_builder(b) \
((b)->shader->info.stage == MESA_SHADER_FRAGMENT ? \
ACCESS_INCLUDE_HELPERS : 0)
static inline nir_def *
brw_nir_rt_load(nir_builder *b, nir_def *addr, unsigned align,
unsigned components, unsigned bit_size)
{
return nir_build_load_global(b, components, bit_size, addr,
.align_mul = align,
.access = is_access_for_builder(b));
}
static inline void
brw_nir_rt_store(nir_builder *b, nir_def *addr, unsigned align,
nir_def *value, unsigned write_mask)
{
nir_build_store_global(b, value, addr,
.align_mul = align,
.write_mask = (write_mask) &
BITFIELD_MASK(value->num_components),
.access = is_access_for_builder(b));
}
static inline nir_def *
brw_nir_rt_load_const(nir_builder *b, unsigned components,
nir_def *addr, nir_def *pred)
{
return nir_load_global_const_block_intel(b, components, addr, pred);
}
static inline nir_def *
brw_load_btd_dss_id(nir_builder *b)
{
return nir_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_DSS);
}
static inline nir_def *
brw_nir_rt_load_num_simd_lanes_per_dss(nir_builder *b,
const struct intel_device_info *devinfo)
{
return nir_imm_int(b, devinfo->num_thread_per_eu *
devinfo->max_eus_per_subslice *
16 /* The RT computation is based off SIMD16 */);
}
static inline nir_def *
brw_load_eu_thread_simd(nir_builder *b)
{
return nir_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_EU_THREAD_SIMD);
}
static inline nir_def *
brw_nir_rt_async_stack_id(nir_builder *b)
{
return nir_iadd(b, nir_umul_32x16(b, nir_load_ray_num_dss_rt_stacks_intel(b),
brw_load_btd_dss_id(b)),
nir_load_btd_stack_id_intel(b));
}
static inline nir_def *
brw_nir_rt_sync_stack_id(nir_builder *b)
{
return brw_load_eu_thread_simd(b);
}
/* We have our own load/store scratch helpers because they emit a global
* memory read or write based on the scratch_base_ptr system value rather
* than a load/store_scratch intrinsic.
*/
static inline nir_def *
brw_nir_rt_load_scratch(nir_builder *b, uint32_t offset, unsigned align,
unsigned num_components, unsigned bit_size)
{
nir_def *addr =
nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), offset);
return brw_nir_rt_load(b, addr, MIN2(align, BRW_BTD_STACK_ALIGN),
num_components, bit_size);
}
static inline void
brw_nir_rt_store_scratch(nir_builder *b, uint32_t offset, unsigned align,
nir_def *value, nir_component_mask_t write_mask)
{
nir_def *addr =
nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), offset);
brw_nir_rt_store(b, addr, MIN2(align, BRW_BTD_STACK_ALIGN),
value, write_mask);
}
static inline void
brw_nir_btd_spawn(nir_builder *b, nir_def *record_addr)
{
nir_btd_spawn_intel(b, nir_load_btd_global_arg_addr_intel(b), record_addr);
}
static inline void
brw_nir_btd_retire(nir_builder *b)
{
nir_btd_retire_intel(b);
}
/** This is a pseudo-op which does a bindless return
*
* It loads the return address from the stack and calls btd_spawn to spawn the
* resume shader.
*/
static inline void
brw_nir_btd_return(struct nir_builder *b)
{
nir_def *resume_addr =
brw_nir_rt_load_scratch(b, BRW_BTD_STACK_RESUME_BSR_ADDR_OFFSET,
8 /* align */, 1, 64);
brw_nir_btd_spawn(b, resume_addr);
}
static inline void
assert_def_size(nir_def *def, unsigned num_components, unsigned bit_size)
{
assert(def->num_components == num_components);
assert(def->bit_size == bit_size);
}
static inline nir_def *
brw_nir_num_rt_stacks(nir_builder *b,
const struct intel_device_info *devinfo)
{
return nir_imul_imm(b, nir_load_ray_num_dss_rt_stacks_intel(b),
intel_device_info_dual_subslice_id_bound(devinfo));
}
static inline nir_def *
brw_nir_rt_sw_hotzone_addr(nir_builder *b,
const struct intel_device_info *devinfo)
{
nir_def *offset32 =
nir_imul_imm(b, brw_nir_rt_async_stack_id(b),
BRW_RT_SIZEOF_HOTZONE);
offset32 = nir_iadd(b, offset32, nir_ineg(b,
nir_imul_imm(b, brw_nir_num_rt_stacks(b, devinfo),
BRW_RT_SIZEOF_HOTZONE)));
return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
nir_i2i64(b, offset32));
}
static inline nir_def *
brw_nir_rt_sync_stack_addr(nir_builder *b,
nir_def *base_mem_addr,
const struct intel_device_info *devinfo)
{
/* For Ray queries (Synchronous Ray Tracing), the formula is similar but
* goes down from rtMemBasePtr:
*
* syncBase = RTDispatchGlobals.rtMemBasePtr
* - (DSSID * NUM_SIMD_LANES_PER_DSS + SyncStackID + 1)
* * syncStackSize
*
* We assume that we can calculate a 32-bit offset first and then add it
* to the 64-bit base address at the end.
*/
nir_def *offset32 =
nir_imul(b,
nir_iadd(b,
nir_imul(b, brw_load_btd_dss_id(b),
brw_nir_rt_load_num_simd_lanes_per_dss(b, devinfo)),
nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
}
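/* Illustrative numbers for the formula above: with DSSID = 2,
 * NUM_SIMD_LANES_PER_DSS = 1024 and SyncStackID = 7, the ray-query stack
 * starts (2 * 1024 + 7 + 1) * BRW_RT_SIZEOF_RAY_QUERY bytes below
 * rtMemBasePtr.
 */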
static inline nir_def *
brw_nir_rt_stack_addr(nir_builder *b)
{
/* From the BSpec "Address Computation for Memory Based Data Structures:
* Ray and TraversalStack (Async Ray Tracing)":
*
* stackBase = RTDispatchGlobals.rtMemBasePtr
* + (DSSID * RTDispatchGlobals.numDSSRTStacks + stackID)
* * RTDispatchGlobals.stackSizePerRay // 64B aligned
*
* We assume that we can calculate a 32-bit offset first and then add it
* to the 64-bit base address at the end.
*/
nir_def *offset32 =
nir_imul(b, brw_nir_rt_async_stack_id(b),
nir_load_ray_hw_stack_size_intel(b));
return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
nir_u2u64(b, offset32));
}
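/* Illustrative numbers for the formula above: with DSSID = 1,
 * numDSSRTStacks = 2048, stackID = 5 and a 256B per-ray stack size, the
 * stack starts at rtMemBasePtr + (1 * 2048 + 5) * 256.
 */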
static inline nir_def *
brw_nir_rt_mem_hit_addr_from_addr(nir_builder *b,
nir_def *stack_addr,
bool committed)
{
return nir_iadd_imm(b, stack_addr, committed ? 0 : BRW_RT_SIZEOF_HIT_INFO);
}
static inline nir_def *
brw_nir_rt_mem_hit_addr(nir_builder *b, bool committed)
{
return nir_iadd_imm(b, brw_nir_rt_stack_addr(b),
committed ? 0 : BRW_RT_SIZEOF_HIT_INFO);
}
static inline nir_def *
brw_nir_rt_hit_attrib_data_addr(nir_builder *b)
{
return nir_iadd_imm(b, brw_nir_rt_stack_addr(b),
BRW_RT_OFFSETOF_HIT_ATTRIB_DATA);
}
static inline nir_def *
brw_nir_rt_mem_ray_addr(nir_builder *b,
nir_def *stack_addr,
enum brw_rt_bvh_level bvh_level)
{
/* From the BSpec "Address Computation for Memory Based Data Structures:
* Ray and TraversalStack (Async Ray Tracing)":
*
* rayBase = stackBase + sizeof(HitInfo) * 2 // 64B aligned
* rayPtr = rayBase + bvhLevel * sizeof(Ray); // 64B aligned
*
* In Vulkan, we always have exactly two levels of BVH: World and Object.
*/
uint32_t offset = BRW_RT_SIZEOF_HIT_INFO * 2 +
bvh_level * BRW_RT_SIZEOF_RAY;
return nir_iadd_imm(b, stack_addr, offset);
}
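/* For example, the Object-level ray (bvh_level = 1) sits at
 *
 *    stack_addr + 2 * BRW_RT_SIZEOF_HIT_INFO + BRW_RT_SIZEOF_RAY
 *
 * while the World-level ray immediately follows the two HitInfo slots.
 */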
static inline nir_def *
brw_nir_rt_sw_stack_addr(nir_builder *b,
const struct intel_device_info *devinfo)
{
nir_def *addr = nir_load_ray_base_mem_addr_intel(b);
nir_def *offset32 = nir_imul(b, brw_nir_num_rt_stacks(b, devinfo),
nir_load_ray_hw_stack_size_intel(b));
addr = nir_iadd(b, addr, nir_u2u64(b, offset32));
nir_def *offset_in_stack =
nir_imul(b, nir_u2u64(b, brw_nir_rt_async_stack_id(b)),
nir_u2u64(b, nir_load_ray_sw_stack_size_intel(b)));
return nir_iadd(b, addr, offset_in_stack);
}
static inline nir_def *
nir_unpack_64_4x16_split_z(nir_builder *b, nir_def *val)
{
return nir_unpack_32_2x16_split_x(b, nir_unpack_64_2x32_split_y(b, val));
}
struct brw_nir_rt_globals_defs {
nir_def *base_mem_addr;
nir_def *call_stack_handler_addr;
nir_def *hw_stack_size;
nir_def *num_dss_rt_stacks;
nir_def *hit_sbt_addr;
nir_def *hit_sbt_stride;
nir_def *miss_sbt_addr;
nir_def *miss_sbt_stride;
nir_def *sw_stack_size;
nir_def *launch_size;
nir_def *call_sbt_addr;
nir_def *call_sbt_stride;
nir_def *resume_sbt_addr;
};
static inline void
brw_nir_rt_load_globals_addr(nir_builder *b,
struct brw_nir_rt_globals_defs *defs,
nir_def *addr)
{
nir_def *data;
data = brw_nir_rt_load_const(b, 16, addr, nir_imm_true(b));
defs->base_mem_addr = nir_pack_64_2x32(b, nir_trim_vector(b, data, 2));
defs->call_stack_handler_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
defs->hw_stack_size = nir_channel(b, data, 4);
defs->num_dss_rt_stacks = nir_iand_imm(b, nir_channel(b, data, 5), 0xffff);
defs->hit_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 8),
nir_extract_i16(b, nir_channel(b, data, 9),
nir_imm_int(b, 0)));
defs->hit_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 9));
defs->miss_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 10),
nir_extract_i16(b, nir_channel(b, data, 11),
nir_imm_int(b, 0)));
defs->miss_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 11));
defs->sw_stack_size = nir_channel(b, data, 12);
defs->launch_size = nir_channels(b, data, 0x7u << 13);
data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64), nir_imm_true(b));
defs->call_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
nir_extract_i16(b, nir_channel(b, data, 1),
nir_imm_int(b, 0)));
defs->call_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
defs->resume_sbt_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
}
static inline void
brw_nir_rt_load_globals(nir_builder *b,
struct brw_nir_rt_globals_defs *defs)
{
brw_nir_rt_load_globals_addr(b, defs, nir_load_btd_global_arg_addr_intel(b));
}
static inline nir_def *
brw_nir_rt_unpack_leaf_ptr(nir_builder *b, nir_def *vec2)
{
/* Hit record leaf pointers are 42-bit and assumed to be in 64B chunks.
* This leaves 22 bits at the top for other stuff.
*/
nir_def *ptr64 = nir_imul_imm(b, nir_pack_64_2x32(b, vec2), 64);
/* The top 16 bits (remember, we shifted by 6 already) contain garbage
* that we need to get rid of.
*/
nir_def *ptr_lo = nir_unpack_64_2x32_split_x(b, ptr64);
nir_def *ptr_hi = nir_unpack_64_2x32_split_y(b, ptr64);
ptr_hi = nir_extract_i16(b, ptr_hi, nir_imm_int(b, 0));
return nir_pack_64_2x32_split(b, ptr_lo, ptr_hi);
}
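/* Step by step: the multiply by 64 is a shift left by 6, turning the 42-bit
 * chunk index into a 48-bit byte address; extract_i16 then sign-extends the
 * high dword from its low 16 bits, discarding the 16 garbage bits and
 * yielding a canonical 64-bit pointer.
 */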
/**
* MemHit memory layout (BSpec 47547) :
*
* name bits description
* - t 32 hit distance of current hit (or initial traversal distance)
* - u 32 barycentric hit coordinates
* - v 32 barycentric hit coordinates
* - primIndexDelta 16 prim index delta for compressed meshlets and quads
* - valid 1 set if there is a hit
* - leafType 3 type of node primLeafPtr is pointing to
* - primLeafIndex 4 index of the hit primitive inside the leaf
* - bvhLevel 3 the instancing level at which the hit occurred
* - frontFace 1 whether we hit the front-facing side of a triangle (also used to pass opaque flag when calling intersection shaders)
* - pad0 4 unused bits
* - primLeafPtr 42 pointer to BVH leaf node (multiple of 64 bytes)
* - hitGroupRecPtr0 22 LSB of hit group record of the hit triangle (multiple of 16 bytes)
* - instLeafPtr 42 pointer to BVH instance leaf node (in multiple of 64 bytes)
* - hitGroupRecPtr1 22 MSB of hit group record of the hit triangle (multiple of 32 bytes)
*/
struct brw_nir_rt_mem_hit_defs {
nir_def *t;
nir_def *tri_bary; /**< Only valid for triangle geometry */
nir_def *aabb_hit_kind; /**< Only valid for AABB geometry */
nir_def *valid;
nir_def *leaf_type;
nir_def *prim_index_delta;
nir_def *prim_leaf_index;
nir_def *bvh_level;
nir_def *front_face;
nir_def *done; /**< Only for ray queries */
nir_def *prim_leaf_ptr;
nir_def *inst_leaf_ptr;
};
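/* A purely illustrative C view of DW3 of MemHit as decoded by the loader
 * below (field order per the layout comment above; "done" is a ray-query
 * flag carved out of pad0):
 *
 *    struct mem_hit_dw3 {
 *       uint32_t prim_index_delta : 16;
 *       uint32_t valid            : 1;
 *       uint32_t leaf_type        : 3;
 *       uint32_t prim_leaf_index  : 4;
 *       uint32_t bvh_level        : 3;
 *       uint32_t front_face       : 1;
 *       uint32_t done             : 1;
 *       uint32_t pad              : 3;
 *    };
 */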
static inline void
brw_nir_rt_load_mem_hit_from_addr(nir_builder *b,
struct brw_nir_rt_mem_hit_defs *defs,
nir_def *stack_addr,
bool committed)
{
nir_def *hit_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, committed);
nir_def *data = brw_nir_rt_load(b, hit_addr, 16, 4, 32);
defs->t = nir_channel(b, data, 0);
defs->aabb_hit_kind = nir_channel(b, data, 1);
defs->tri_bary = nir_channels(b, data, 0x6);
nir_def *bitfield = nir_channel(b, data, 3);
defs->prim_index_delta =
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 0), nir_imm_int(b, 16));
defs->valid = nir_i2b(b, nir_iand_imm(b, bitfield, 1u << 16));
defs->leaf_type =
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 17), nir_imm_int(b, 3));
defs->prim_leaf_index =
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 20), nir_imm_int(b, 4));
defs->bvh_level =
nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 24), nir_imm_int(b, 3));
defs->front_face = nir_i2b(b, nir_iand_imm(b, bitfield, 1 << 27));
defs->done = nir_i2b(b, nir_iand_imm(b, bitfield, 1 << 28));
data = brw_nir_rt_load(b, nir_iadd_imm(b, hit_addr, 16), 16, 4, 32);
defs->prim_leaf_ptr =
brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 0));
defs->inst_leaf_ptr =
brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 2));
}
static inline void
brw_nir_rt_load_mem_hit(nir_builder *b,
struct brw_nir_rt_mem_hit_defs *defs,
bool committed)
{
brw_nir_rt_load_mem_hit_from_addr(b, defs, brw_nir_rt_stack_addr(b),
committed);
}
static inline void
brw_nir_memcpy_global(nir_builder *b,
nir_def *dst_addr, uint32_t dst_align,
nir_def *src_addr, uint32_t src_align,
uint32_t size)
{
/* We're going to copy in 16B chunks */
assert(size % 16 == 0);
dst_align = MIN2(dst_align, 16);
src_align = MIN2(src_align, 16);
for (unsigned offset = 0; offset < size; offset += 16) {
nir_def *data =
brw_nir_rt_load(b, nir_iadd_imm(b, src_addr, offset), 16,
4, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), 16,
data, 0xf /* write_mask */);
}
}
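/* For example, a 32-byte copy such as
 *
 *    brw_nir_memcpy_global(b, dst, 16, src, 16, 32);
 *
 * unrolls into two vec4 loads at src+0 and src+16 with matching vec4
 * stores at dst+0 and dst+16.
 */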
static inline void
brw_nir_memclear_global(nir_builder *b,
nir_def *dst_addr, uint32_t dst_align,
uint32_t size)
{
/* We're going to copy in 16B chunks */
assert(size % 16 == 0);
dst_align = MIN2(dst_align, 16);
nir_def *zero = nir_imm_ivec4(b, 0, 0, 0, 0);
for (unsigned offset = 0; offset < size; offset += 16) {
brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), dst_align,
zero, 0xf /* write_mask */);
}
}
static inline nir_def *
brw_nir_rt_query_done(nir_builder *b, nir_def *stack_addr)
{
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr,
false /* committed */);
return hit_in.done;
}
static inline void
brw_nir_rt_set_dword_bit_at(nir_builder *b,
nir_def *addr,
uint32_t addr_offset,
uint32_t bit)
{
nir_def *dword_addr = nir_iadd_imm(b, addr, addr_offset);
nir_def *dword = brw_nir_rt_load(b, dword_addr, 4, 1, 32);
brw_nir_rt_store(b, dword_addr, 4, nir_ior_imm(b, dword, 1u << bit), 0x1);
}
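/* brw_nir_rt_query_mark_done() below uses this read-modify-write helper to
 * set bit 28 (the "done" flag) in dword 3 of the potential MemHit.
 */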
static inline void
brw_nir_rt_query_mark_done(nir_builder *b, nir_def *stack_addr)
{
brw_nir_rt_set_dword_bit_at(b,
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr,
false /* committed */),
4 * 3 /* dword offset */, 28 /* bit */);
}
/* This helper clears dword 3 of both MemHit structures (committed and
 * potential), which is where the valid bit lives.
 */
static inline void
brw_nir_rt_query_mark_init(nir_builder *b, nir_def *stack_addr)
{
nir_def *dword_addr;
for (uint32_t i = 0; i < 2; i++) {
dword_addr =
nir_iadd_imm(b,
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr,
i == 0 /* committed */),
4 * 3 /* dword offset */);
brw_nir_rt_store(b, dword_addr, 4, nir_imm_int(b, 0), 0x1);
}
}
/* This helper is essentially a memcpy of the uncommitted (potential) hit
 * structure into the committed one, additionally setting the valid bit.
 */
static inline void
brw_nir_rt_commit_hit_addr(nir_builder *b, nir_def *stack_addr)
{
nir_def *dst_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true /* committed */);
nir_def *src_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false /* committed */);
for (unsigned offset = 0; offset < BRW_RT_SIZEOF_HIT_INFO; offset += 16) {
nir_def *data =
brw_nir_rt_load(b, nir_iadd_imm(b, src_addr, offset), 16, 4, 32);
if (offset == 0) {
data = nir_vec4(b,
nir_channel(b, data, 0),
nir_channel(b, data, 1),
nir_channel(b, data, 2),
nir_ior_imm(b,
nir_channel(b, data, 3),
0x1 << 16 /* valid */));
/* Also write the potential hit as we change it. */
brw_nir_rt_store(b, nir_iadd_imm(b, src_addr, offset), 16,
data, 0xf /* write_mask */);
}
brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), 16,
data, 0xf /* write_mask */);
}
}
static inline void
brw_nir_rt_commit_hit(nir_builder *b)
{
nir_def *stack_addr = brw_nir_rt_stack_addr(b);
brw_nir_rt_commit_hit_addr(b, stack_addr);
}
static inline void
brw_nir_rt_generate_hit_addr(nir_builder *b, nir_def *stack_addr, nir_def *t_val)
{
nir_def *committed_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true /* committed */);
nir_def *potential_addr =
brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false /* committed */);
/* Set:
*
* potential.t = t_val;
* potential.valid = true;
*/
nir_def *potential_hit_dwords_0_3 =
brw_nir_rt_load(b, potential_addr, 16, 4, 32);
potential_hit_dwords_0_3 =
nir_vec4(b,
t_val,
nir_channel(b, potential_hit_dwords_0_3, 1),
nir_channel(b, potential_hit_dwords_0_3, 2),
nir_ior_imm(b, nir_channel(b, potential_hit_dwords_0_3, 3),
(0x1 << 16) /* valid */));
brw_nir_rt_store(b, potential_addr, 16, potential_hit_dwords_0_3, 0xf /* write_mask */);
/* Set:
*
* committed.t = t_val;
* committed.u = 0.0f;
* committed.v = 0.0f;
* committed.valid = true;
* committed.leaf_type = potential.leaf_type;
* committed.bvh_level = BRW_RT_BVH_LEVEL_OBJECT;
* committed.front_face = false;
* committed.prim_leaf_index = 0;
* committed.done = false;
*/
nir_def *committed_hit_dwords_0_3 =
brw_nir_rt_load(b, committed_addr, 16, 4, 32);
committed_hit_dwords_0_3 =
nir_vec4(b,
t_val,
nir_imm_float(b, 0.0f),
nir_imm_float(b, 0.0f),
               nir_ior_imm(b,
                           nir_iand_imm(b, nir_channel(b, potential_hit_dwords_0_3, 3),
                                        0x000e0000 /* keep only leaf_type */),
                           (0x1 << 16) /* valid */ |
                           (BRW_RT_BVH_LEVEL_OBJECT << 24) /* bvh_level */));
brw_nir_rt_store(b, committed_addr, 16, committed_hit_dwords_0_3, 0xf /* write_mask */);
/* Set:
*
* committed.prim_leaf_ptr = potential.prim_leaf_ptr;
* committed.inst_leaf_ptr = potential.inst_leaf_ptr;
*/
brw_nir_memcpy_global(b,
nir_iadd_imm(b, committed_addr, 16), 16,
nir_iadd_imm(b, potential_addr, 16), 16,
16);
}
struct brw_nir_rt_mem_ray_defs {
nir_def *orig;
nir_def *dir;
nir_def *t_near;
nir_def *t_far;
nir_def *root_node_ptr;
nir_def *ray_flags;
nir_def *hit_group_sr_base_ptr;
nir_def *hit_group_sr_stride;
nir_def *miss_sr_ptr;
nir_def *shader_index_multiplier;
nir_def *inst_leaf_ptr;
nir_def *ray_mask;
};
static inline void
brw_nir_rt_store_mem_ray_query_at_addr(nir_builder *b,
nir_def *ray_addr,
const struct brw_nir_rt_mem_ray_defs *defs)
{
assert_def_size(defs->orig, 3, 32);
assert_def_size(defs->dir, 3, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 0), 16,
nir_vec4(b, nir_channel(b, defs->orig, 0),
nir_channel(b, defs->orig, 1),
nir_channel(b, defs->orig, 2),
nir_channel(b, defs->dir, 0)),
~0 /* write mask */);
assert_def_size(defs->t_near, 1, 32);
assert_def_size(defs->t_far, 1, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 16), 16,
nir_vec4(b, nir_channel(b, defs->dir, 1),
nir_channel(b, defs->dir, 2),
defs->t_near,
defs->t_far),
~0 /* write mask */);
assert_def_size(defs->root_node_ptr, 1, 64);
assert_def_size(defs->ray_flags, 1, 16);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 32), 16,
nir_vec2(b, nir_unpack_64_2x32_split_x(b, defs->root_node_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, defs->root_node_ptr),
defs->ray_flags)),
0x3 /* write mask */);
/* leaf_ptr is optional */
nir_def *inst_leaf_ptr;
if (defs->inst_leaf_ptr) {
inst_leaf_ptr = defs->inst_leaf_ptr;
} else {
inst_leaf_ptr = nir_imm_int64(b, 0);
}
assert_def_size(inst_leaf_ptr, 1, 64);
assert_def_size(defs->ray_mask, 1, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 56), 8,
nir_vec2(b, nir_unpack_64_2x32_split_x(b, inst_leaf_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, inst_leaf_ptr),
nir_unpack_32_2x16_split_x(b, defs->ray_mask))),
~0 /* write mask */);
}
static inline void
brw_nir_rt_store_mem_ray(nir_builder *b,
const struct brw_nir_rt_mem_ray_defs *defs,
enum brw_rt_bvh_level bvh_level)
{
nir_def *ray_addr =
brw_nir_rt_mem_ray_addr(b, brw_nir_rt_stack_addr(b), bvh_level);
assert_def_size(defs->orig, 3, 32);
assert_def_size(defs->dir, 3, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 0), 16,
nir_vec4(b, nir_channel(b, defs->orig, 0),
nir_channel(b, defs->orig, 1),
nir_channel(b, defs->orig, 2),
nir_channel(b, defs->dir, 0)),
~0 /* write mask */);
assert_def_size(defs->t_near, 1, 32);
assert_def_size(defs->t_far, 1, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 16), 16,
nir_vec4(b, nir_channel(b, defs->dir, 1),
nir_channel(b, defs->dir, 2),
defs->t_near,
defs->t_far),
~0 /* write mask */);
assert_def_size(defs->root_node_ptr, 1, 64);
assert_def_size(defs->ray_flags, 1, 16);
assert_def_size(defs->hit_group_sr_base_ptr, 1, 64);
assert_def_size(defs->hit_group_sr_stride, 1, 16);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 32), 16,
nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->root_node_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, defs->root_node_ptr),
defs->ray_flags),
nir_unpack_64_2x32_split_x(b, defs->hit_group_sr_base_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, defs->hit_group_sr_base_ptr),
defs->hit_group_sr_stride)),
~0 /* write mask */);
/* leaf_ptr is optional */
nir_def *inst_leaf_ptr;
if (defs->inst_leaf_ptr) {
inst_leaf_ptr = defs->inst_leaf_ptr;
} else {
inst_leaf_ptr = nir_imm_int64(b, 0);
}
assert_def_size(defs->miss_sr_ptr, 1, 64);
assert_def_size(defs->shader_index_multiplier, 1, 32);
assert_def_size(inst_leaf_ptr, 1, 64);
assert_def_size(defs->ray_mask, 1, 32);
brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 48), 16,
nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->miss_sr_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, defs->miss_sr_ptr),
nir_unpack_32_2x16_split_x(b,
nir_ishl(b, defs->shader_index_multiplier,
nir_imm_int(b, 8)))),
nir_unpack_64_2x32_split_x(b, inst_leaf_ptr),
nir_pack_32_2x16_split(b,
nir_unpack_64_4x16_split_z(b, inst_leaf_ptr),
nir_unpack_32_2x16_split_x(b, defs->ray_mask))),
~0 /* write mask */);
}
static inline void
brw_nir_rt_load_mem_ray_from_addr(nir_builder *b,
struct brw_nir_rt_mem_ray_defs *defs,
nir_def *ray_base_addr,
enum brw_rt_bvh_level bvh_level)
{
nir_def *ray_addr = brw_nir_rt_mem_ray_addr(b,
ray_base_addr,
bvh_level);
nir_def *data[4] = {
brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 0), 16, 4, 32),
brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 16), 16, 4, 32),
brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 32), 16, 4, 32),
brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 48), 16, 4, 32),
};
defs->orig = nir_trim_vector(b, data[0], 3);
defs->dir = nir_vec3(b, nir_channel(b, data[0], 3),
nir_channel(b, data[1], 0),
nir_channel(b, data[1], 1));
defs->t_near = nir_channel(b, data[1], 2);
defs->t_far = nir_channel(b, data[1], 3);
defs->root_node_ptr =
nir_pack_64_2x32_split(b, nir_channel(b, data[2], 0),
nir_extract_i16(b, nir_channel(b, data[2], 1),
nir_imm_int(b, 0)));
defs->ray_flags =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 1));
defs->hit_group_sr_base_ptr =
nir_pack_64_2x32_split(b, nir_channel(b, data[2], 2),
nir_extract_i16(b, nir_channel(b, data[2], 3),
nir_imm_int(b, 0)));
defs->hit_group_sr_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 3));
defs->miss_sr_ptr =
nir_pack_64_2x32_split(b, nir_channel(b, data[3], 0),
nir_extract_i16(b, nir_channel(b, data[3], 1),
nir_imm_int(b, 0)));
defs->shader_index_multiplier =
nir_ushr(b, nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 1)),
nir_imm_int(b, 8));
defs->inst_leaf_ptr =
nir_pack_64_2x32_split(b, nir_channel(b, data[3], 2),
nir_extract_i16(b, nir_channel(b, data[3], 3),
nir_imm_int(b, 0)));
defs->ray_mask =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 3));
}
static inline void
brw_nir_rt_load_mem_ray(nir_builder *b,
struct brw_nir_rt_mem_ray_defs *defs,
enum brw_rt_bvh_level bvh_level)
{
brw_nir_rt_load_mem_ray_from_addr(b, defs, brw_nir_rt_stack_addr(b),
bvh_level);
}
struct brw_nir_rt_bvh_instance_leaf_defs {
nir_def *shader_index;
nir_def *contribution_to_hit_group_index;
nir_def *world_to_object[4];
nir_def *instance_id;
nir_def *instance_index;
nir_def *object_to_world[4];
};
static inline void
brw_nir_rt_load_bvh_instance_leaf(nir_builder *b,
struct brw_nir_rt_bvh_instance_leaf_defs *defs,
nir_def *leaf_addr)
{
nir_def *leaf_desc = brw_nir_rt_load(b, leaf_addr, 4, 2, 32);
defs->shader_index =
nir_iand_imm(b, nir_channel(b, leaf_desc, 0), (1 << 24) - 1);
defs->contribution_to_hit_group_index =
nir_iand_imm(b, nir_channel(b, leaf_desc, 1), (1 << 24) - 1);
defs->world_to_object[0] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 16), 4, 3, 32);
defs->world_to_object[1] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 28), 4, 3, 32);
defs->world_to_object[2] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 40), 4, 3, 32);
/* The last column of each matrix is stored alongside the other matrix,
 * presumably because that layout is easier or faster for the hardware to
 * consume.
 */
defs->object_to_world[3] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 52), 4, 3, 32);
nir_def *data =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 64), 4, 4, 32);
defs->instance_id = nir_channel(b, data, 2);
defs->instance_index = nir_channel(b, data, 3);
defs->object_to_world[0] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 80), 4, 3, 32);
defs->object_to_world[1] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 92), 4, 3, 32);
defs->object_to_world[2] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 104), 4, 3, 32);
defs->world_to_object[3] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 116), 4, 3, 32);
}
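/* Instance leaf layout as decoded above (byte offsets):
 *     0: shader index / hit-group contribution dwords
 *    16: world_to_object columns 0-2 (3 floats each)
 *    52: object_to_world column 3
 *    64: instance ID (dword 2) and instance index (dword 3)
 *    80: object_to_world columns 0-2
 *   116: world_to_object column 3
 */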
struct brw_nir_rt_bvh_primitive_leaf_defs {
nir_def *shader_index;
nir_def *geom_mask;
nir_def *geom_index;
nir_def *type;
nir_def *geom_flags;
};
static inline void
brw_nir_rt_load_bvh_primitive_leaf(nir_builder *b,
struct brw_nir_rt_bvh_primitive_leaf_defs *defs,
nir_def *leaf_addr)
{
nir_def *desc = brw_nir_rt_load(b, leaf_addr, 4, 2, 32);
defs->shader_index =
nir_ubitfield_extract(b, nir_channel(b, desc, 0),
nir_imm_int(b, 23), nir_imm_int(b, 0));
defs->geom_mask =
nir_ubitfield_extract(b, nir_channel(b, desc, 0),
nir_imm_int(b, 31), nir_imm_int(b, 24));
defs->geom_index =
nir_ubitfield_extract(b, nir_channel(b, desc, 1),
nir_imm_int(b, 28), nir_imm_int(b, 0));
defs->type =
nir_ubitfield_extract(b, nir_channel(b, desc, 1),
nir_imm_int(b, 29), nir_imm_int(b, 29));
defs->geom_flags =
nir_ubitfield_extract(b, nir_channel(b, desc, 1),
nir_imm_int(b, 31), nir_imm_int(b, 30));
}
struct brw_nir_rt_bvh_primitive_leaf_positions_defs {
nir_def *positions[3];
};
static inline void
brw_nir_rt_load_bvh_primitive_leaf_positions(nir_builder *b,
struct brw_nir_rt_bvh_primitive_leaf_positions_defs *defs,
nir_def *leaf_addr)
{
for (unsigned i = 0; i < ARRAY_SIZE(defs->positions); i++) {
defs->positions[i] =
brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 16 + i * 4 * 3), 4, 3, 32);
}
}
static inline nir_def *
brw_nir_rt_load_primitive_id_from_hit(nir_builder *b,
nir_def *is_procedural,
const struct brw_nir_rt_mem_hit_defs *defs)
{
if (!is_procedural) {
is_procedural =
nir_ieq_imm(b, defs->leaf_type,
BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
}
nir_def *prim_id_proc, *prim_id_quad;
nir_push_if(b, is_procedural);
{
/* For procedural leaves, the index is in dw[3]. */
nir_def *offset =
nir_iadd_imm(b, nir_ishl_imm(b, defs->prim_leaf_index, 2), 12);
prim_id_proc = nir_load_global(b, nir_iadd(b, defs->prim_leaf_ptr,
nir_u2u64(b, offset)),
4, /* align */ 1, 32);
}
nir_push_else(b, NULL);
{
/* For quad leaves, the index is in dw[2], with an additional 16-bit
 * offset in dw[3].
 */
prim_id_quad = nir_load_global(b, nir_iadd_imm(b, defs->prim_leaf_ptr, 8),
4, /* align */ 1, 32);
prim_id_quad = nir_iadd(b,
prim_id_quad,
defs->prim_index_delta);
}
nir_pop_if(b, NULL);
return nir_if_phi(b, prim_id_proc, prim_id_quad);
}
static inline nir_def *
brw_nir_rt_acceleration_structure_to_root_node(nir_builder *b,
nir_def *as_addr)
{
/* The HW memory structure in which we specify what acceleration structure
 * to traverse takes the address of the root node in the acceleration
 * structure, not the acceleration structure itself. To find that, we have
 * to read the root node offset from the acceleration structure, which is
 * the first QWord.
 *
 * But if the acceleration structure pointer is NULL, then we should return
 * NULL as the root node pointer.
 *
 * TODO: we could optimize this by assuming that for a given version of the
 * BVH, we can find the root node at a given offset.
 */
nir_def *root_node_ptr, *null_node_ptr;
nir_push_if(b, nir_ieq_imm(b, as_addr, 0));
{
null_node_ptr = nir_imm_int64(b, 0);
}
nir_push_else(b, NULL);
{
root_node_ptr =
nir_iadd(b, as_addr, brw_nir_rt_load(b, as_addr, 256, 1, 64));
}
nir_pop_if(b, NULL);
return nir_if_phi(b, null_node_ptr, root_node_ptr);
}
#endif /* BRW_NIR_RT_BUILDER_H */

View file

@ -0,0 +1,67 @@
#
# Copyright (C) 2016 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Prior to Kaby Lake, the SIN and COS instructions on Intel hardware can
# produce values slightly outside of the [-1.0, 1.0] range for a small set of
# values. Obviously, this can break everyone's expectations about trig
# functions. This appears to be fixed in Kaby Lake.
#
# According to an internal presentation, the COS instruction can produce
# a value up to 1.000027 for inputs in the range (0.08296, 0.09888). One
# suggested workaround is to multiply by 0.99997, scaling down the
# amplitude slightly. Apparently this also minimizes the error function,
# reducing the maximum error from 0.00006 to about 0.00003.
import argparse
import sys
from math import pi
TRIG_WORKAROUNDS = [
(('fsin', 'x(is_not_const)'), ('fmul', ('fsin', 'x'), 0.99997)),
(('fcos', 'x(is_not_const)'), ('fmul', ('fcos', 'x'), 0.99997)),
]
LIMIT_TRIG_INPUT_RANGE_WORKAROUND = [
(('fsin', 'x(is_not_const)'), ('fsin', ('fmod', 'x', 2.0 * pi))),
(('fcos', 'x(is_not_const)'), ('fcos', ('fmod', 'x', 2.0 * pi))),
]
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--import-path', required=True)
args = parser.parse_args()
sys.path.insert(0, args.import_path)
run()
def run():
import nir_algebraic # pylint: disable=import-error
print('#include "brw_nir.h"')
print(nir_algebraic.AlgebraicPass("brw_nir_apply_trig_workarounds",
TRIG_WORKAROUNDS).render())
print(nir_algebraic.AlgebraicPass("brw_nir_limit_trig_input_range_workaround",
LIMIT_TRIG_INPUT_RANGE_WORKAROUND).render())
if __name__ == '__main__':
main()
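# The render() calls above emit nir_algebraic passes with the usual
# generated signature, e.g.
#
#    bool brw_nir_apply_trig_workarounds(nir_shader *shader);
#
# which callers run like any other NIR optimization pass.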

View file

@ -0,0 +1,75 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_reg.h"
union fu {
float f;
unsigned u;
struct {
unsigned mantissa:23;
unsigned exponent:8;
unsigned sign:1;
} s;
};
int
brw_float_to_vf(float f)
{
union fu fu = { .f = f };
/* ±0.0f is special cased. */
if (f == 0.0f)
return fu.s.sign << 7;
unsigned mantissa = fu.s.mantissa >> (23 - 4);
unsigned exponent = fu.s.exponent - (127 - 3);
unsigned vf = (fu.s.sign << 7) | (exponent << 4) | mantissa;
/* 0.125 would have had the same representation as 0.0, so reject it. */
if ((vf & 0x7f) == 0)
return -1;
/* Make sure the mantissa fits in 4-bits and the exponent in 3-bits. */
if (fu.u & 0x7ffff || exponent > 7)
return -1;
return vf;
}
float
brw_vf_to_float(unsigned char vf)
{
union fu fu;
/* ±0.0f is special cased. */
if (vf == 0x00 || vf == 0x80) {
fu.u = (unsigned)vf << 24;
return fu.f;
}
fu.s.sign = vf >> 7;
fu.s.exponent = ((vf & 0x70) >> 4) + (127 - 3);
fu.s.mantissa = (vf & 0xf) << (23 - 4);
return fu.f;
}
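/* Worked examples, following directly from the encodings above:
 *
 *    brw_float_to_vf(0.5f)  == 0x20  (sign 0, exponent 126 - 124 = 2, mantissa 0)
 *    brw_float_to_vf(-2.0f) == 0xc0  (sign 1, exponent 128 - 124 = 4, mantissa 0)
 *    brw_float_to_vf(0.1f)  == -1    (mantissa does not fit in 4 bits)
 *    brw_vf_to_float(0x30)  == 1.0f
 */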

View file

@ -0,0 +1,243 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_shader.h"
using namespace brw;
/** @file brw_predicated_break.cpp
*
* Loops are often structured as
*
* loop:
* CMP.f0
* (+f0) IF
* BREAK
* ENDIF
* ...
* WHILE loop
*
* This peephole pass removes the IF and ENDIF instructions and predicates the
* BREAK, dropping two instructions from the loop body.
*
* If the loop was a DO { ... } WHILE loop, it looks like
*
* loop:
* ...
* CMP.f0
* (+f0) IF
* BREAK
* ENDIF
* WHILE loop
*
* and we can remove the BREAK instruction and predicate the WHILE.
*/
#define MAX_NESTING 128
struct loop_continue_tracking {
BITSET_WORD has_continue[BITSET_WORDS(MAX_NESTING)];
unsigned depth;
};
static void
enter_loop(struct loop_continue_tracking *s)
{
s->depth++;
/* Any loop nested deeper than the maximum will just re-use the last
 * flag. This simplifies most of the code. MAX_NESTING is chosen to be
 * large enough that it is unlikely to occur. Even if it does, the
 * optimization that uses this tracking is unlikely to make much
 * difference.
 */
if (s->depth < MAX_NESTING)
BITSET_CLEAR(s->has_continue, s->depth);
}
static void
exit_loop(struct loop_continue_tracking *s)
{
assert(s->depth > 0);
s->depth--;
}
static void
set_continue(struct loop_continue_tracking *s)
{
const unsigned i = MIN2(s->depth, MAX_NESTING - 1);
BITSET_SET(s->has_continue, i);
}
static bool
has_continue(const struct loop_continue_tracking *s)
{
const unsigned i = MIN2(s->depth, MAX_NESTING - 1);
return BITSET_TEST(s->has_continue, i);
}
bool
opt_predicated_break(backend_shader *s)
{
bool progress = false;
struct loop_continue_tracking state = { {0, }, 0 };
foreach_block (block, s->cfg) {
/* DO instructions, by definition, can only be found at the beginning of
* basic blocks.
*/
backend_instruction *const do_inst = block->start();
/* BREAK, CONTINUE, and WHILE instructions, by definition, can only be
* found at the ends of basic blocks.
*/
backend_instruction *jump_inst = block->end();
if (do_inst->opcode == BRW_OPCODE_DO)
enter_loop(&state);
if (jump_inst->opcode == BRW_OPCODE_CONTINUE)
set_continue(&state);
else if (jump_inst->opcode == BRW_OPCODE_WHILE)
exit_loop(&state);
if (block->start_ip != block->end_ip)
continue;
if (jump_inst->opcode != BRW_OPCODE_BREAK &&
jump_inst->opcode != BRW_OPCODE_CONTINUE)
continue;
backend_instruction *if_inst = block->prev()->end();
if (if_inst->opcode != BRW_OPCODE_IF)
continue;
backend_instruction *endif_inst = block->next()->start();
if (endif_inst->opcode != BRW_OPCODE_ENDIF)
continue;
bblock_t *jump_block = block;
bblock_t *if_block = jump_block->prev();
bblock_t *endif_block = jump_block->next();
jump_inst->predicate = if_inst->predicate;
jump_inst->predicate_inverse = if_inst->predicate_inverse;
bblock_t *earlier_block = if_block;
if (if_block->start_ip == if_block->end_ip) {
earlier_block = if_block->prev();
}
if_inst->remove(if_block);
bblock_t *later_block = endif_block;
if (endif_block->start_ip == endif_block->end_ip) {
later_block = endif_block->next();
}
endif_inst->remove(endif_block);
if (!earlier_block->ends_with_control_flow()) {
/* FIXME: There is a potential problem here. If earlier_block starts
* with a DO instruction, this will delete the physical link to the
* WHILE block. It is unclear whether ENDIF has the same potential
* problem.
*/
assert(earlier_block->start() == NULL ||
earlier_block->start()->opcode != BRW_OPCODE_DO);
earlier_block->unlink_children();
earlier_block->add_successor(s->cfg->mem_ctx, jump_block,
bblock_link_logical);
}
if (!later_block->starts_with_control_flow()) {
later_block->unlink_parents();
}
/* If jump_block already has a link to later_block, don't create another
* one. Instead, promote the link to logical.
*/
bool need_to_link = true;
foreach_list_typed(bblock_link, link, link, &jump_block->children) {
if (link->block == later_block) {
assert(later_block->starts_with_control_flow());
/* Update the link from later_block back to jump_block. */
foreach_list_typed(bblock_link, parent_link, link, &later_block->parents) {
if (parent_link->block == jump_block) {
parent_link->kind = bblock_link_logical;
}
}
/* Update the link from jump_block to later_block. */
link->kind = bblock_link_logical;
need_to_link = false;
}
}
if (need_to_link) {
jump_block->add_successor(s->cfg->mem_ctx, later_block,
bblock_link_logical);
}
if (earlier_block->can_combine_with(jump_block)) {
earlier_block->combine_with(jump_block);
block = earlier_block;
}
/* Now look at the first instruction of the block following the BREAK. If
* it's a WHILE, we can delete the break, predicate the WHILE, and join
* the two basic blocks.
*
* This optimization can only be applied if the only instruction that
* can transfer control to the WHILE is the BREAK. If other paths can
* lead to the while, the flags may be in an unknown state, and the loop
* could terminate prematurely. This can occur if the loop contains a
* CONT instruction.
*/
bblock_t *while_block = earlier_block->next();
backend_instruction *while_inst = while_block->start();
if (jump_inst->opcode == BRW_OPCODE_BREAK &&
while_inst->opcode == BRW_OPCODE_WHILE &&
while_inst->predicate == BRW_PREDICATE_NONE &&
!has_continue(&state)) {
jump_inst->remove(earlier_block);
while_inst->predicate = jump_inst->predicate;
while_inst->predicate_inverse = !jump_inst->predicate_inverse;
assert(earlier_block->can_combine_with(while_block));
earlier_block->combine_with(while_block);
}
progress = true;
}
if (progress)
s->invalidate_analysis(DEPENDENCY_BLOCKS | DEPENDENCY_INSTRUCTIONS);
return progress;
}

View file

@ -0,0 +1,50 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_PRIM_H
#define BRW_PRIM_H
#define _3DPRIM_POINTLIST 0x01
#define _3DPRIM_LINELIST 0x02
#define _3DPRIM_LINESTRIP 0x03
#define _3DPRIM_TRILIST 0x04
#define _3DPRIM_TRISTRIP 0x05
#define _3DPRIM_TRIFAN 0x06
#define _3DPRIM_QUADLIST 0x07
#define _3DPRIM_QUADSTRIP 0x08
#define _3DPRIM_LINELIST_ADJ 0x09 /* G45+ */
#define _3DPRIM_LINESTRIP_ADJ 0x0A /* G45+ */
#define _3DPRIM_TRILIST_ADJ 0x0B /* G45+ */
#define _3DPRIM_TRISTRIP_ADJ 0x0C /* G45+ */
#define _3DPRIM_TRISTRIP_REVERSE 0x0D
#define _3DPRIM_POLYGON 0x0E
#define _3DPRIM_RECTLIST 0x0F
#define _3DPRIM_LINELOOP 0x10
#define _3DPRIM_POINTLIST_BF 0x11
#define _3DPRIM_LINESTRIP_CONT 0x12
#define _3DPRIM_LINESTRIP_BF 0x13
#define _3DPRIM_LINESTRIP_CONT_BF 0x14
#define _3DPRIM_TRIFAN_NOSTIPPLE 0x16
#define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); })
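/* e.g. _3DPRIM_PATCHLIST(4) evaluates to 0x23. */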
#endif /* BRW_PRIM_H */

View file

@ -0,0 +1,76 @@
/* -*- c++ -*- */
/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_PRIVATE_H
#define BRW_PRIVATE_H
#include "brw_compiler.h"
#include <variant>
unsigned brw_required_dispatch_width(const struct shader_info *info);
static constexpr int SIMD_COUNT = 3;
struct brw_simd_selection_state {
const struct intel_device_info *devinfo;
std::variant<struct brw_cs_prog_data *,
struct brw_bs_prog_data *> prog_data;
unsigned required_width;
const char *error[SIMD_COUNT];
bool compiled[SIMD_COUNT];
bool spilled[SIMD_COUNT];
};
inline int brw_simd_first_compiled(const brw_simd_selection_state &state)
{
for (int i = 0; i < SIMD_COUNT; i++) {
if (state.compiled[i])
return i;
}
return -1;
}
inline bool brw_simd_any_compiled(const brw_simd_selection_state &state)
{
return brw_simd_first_compiled(state) >= 0;
}
bool brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd);
void brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled);
int brw_simd_select(const brw_simd_selection_state &state);
int brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
const struct brw_cs_prog_data *prog_data,
const unsigned *sizes);
bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag);
#endif // BRW_PRIVATE_H

File diff suppressed because it is too large.

View file

@ -0,0 +1,563 @@
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_reg.h"
#include "brw_eu_defines.h"
#include "dev/intel_device_info.h"
#define INVALID (-1)
enum hw_reg_type {
BRW_HW_REG_TYPE_UD = 0,
BRW_HW_REG_TYPE_D = 1,
BRW_HW_REG_TYPE_UW = 2,
BRW_HW_REG_TYPE_W = 3,
BRW_HW_REG_TYPE_F = 7,
GFX8_HW_REG_TYPE_UQ = 8,
GFX8_HW_REG_TYPE_Q = 9,
BRW_HW_REG_TYPE_UB = 4,
BRW_HW_REG_TYPE_B = 5,
GFX7_HW_REG_TYPE_DF = 6,
GFX8_HW_REG_TYPE_HF = 10,
GFX11_HW_REG_TYPE_UD = 0,
GFX11_HW_REG_TYPE_D = 1,
GFX11_HW_REG_TYPE_UW = 2,
GFX11_HW_REG_TYPE_W = 3,
GFX11_HW_REG_TYPE_UB = 4,
GFX11_HW_REG_TYPE_B = 5,
GFX11_HW_REG_TYPE_UQ = 6,
GFX11_HW_REG_TYPE_Q = 7,
GFX11_HW_REG_TYPE_HF = 8,
GFX11_HW_REG_TYPE_F = 9,
GFX11_HW_REG_TYPE_DF = 10,
GFX11_HW_REG_TYPE_NF = 11,
};
enum hw_imm_type {
BRW_HW_IMM_TYPE_UD = 0,
BRW_HW_IMM_TYPE_D = 1,
BRW_HW_IMM_TYPE_UW = 2,
BRW_HW_IMM_TYPE_W = 3,
BRW_HW_IMM_TYPE_F = 7,
GFX8_HW_IMM_TYPE_UQ = 8,
GFX8_HW_IMM_TYPE_Q = 9,
BRW_HW_IMM_TYPE_UV = 4,
BRW_HW_IMM_TYPE_VF = 5,
BRW_HW_IMM_TYPE_V = 6,
GFX8_HW_IMM_TYPE_DF = 10,
GFX8_HW_IMM_TYPE_HF = 11,
GFX11_HW_IMM_TYPE_UD = 0,
GFX11_HW_IMM_TYPE_D = 1,
GFX11_HW_IMM_TYPE_UW = 2,
GFX11_HW_IMM_TYPE_W = 3,
GFX11_HW_IMM_TYPE_UV = 4,
GFX11_HW_IMM_TYPE_V = 5,
GFX11_HW_IMM_TYPE_UQ = 6,
GFX11_HW_IMM_TYPE_Q = 7,
GFX11_HW_IMM_TYPE_HF = 8,
GFX11_HW_IMM_TYPE_F = 9,
GFX11_HW_IMM_TYPE_DF = 10,
GFX11_HW_IMM_TYPE_VF = 11,
};
#define GFX12_HW_REG_TYPE_UINT(n) (n)
#define GFX12_HW_REG_TYPE_SINT(n) (0x4 | (n))
#define GFX12_HW_REG_TYPE_FLOAT(n) (0x8 | (n))
static const struct hw_type {
enum hw_reg_type reg_type;
enum hw_imm_type imm_type;
} gfx4_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
}, gfx6_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
[BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV },
}, gfx7_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX7_HW_REG_TYPE_DF, INVALID },
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
[BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV },
}, gfx8_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX7_HW_REG_TYPE_DF, GFX8_HW_IMM_TYPE_DF },
[BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_HF] = { GFX8_HW_REG_TYPE_HF, GFX8_HW_IMM_TYPE_HF },
[BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_Q] = { GFX8_HW_REG_TYPE_Q, GFX8_HW_IMM_TYPE_Q },
[BRW_REGISTER_TYPE_UQ] = { GFX8_HW_REG_TYPE_UQ, GFX8_HW_IMM_TYPE_UQ },
[BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V },
[BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV },
}, gfx11_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_NF] = { GFX11_HW_REG_TYPE_NF, INVALID },
[BRW_REGISTER_TYPE_F] = { GFX11_HW_REG_TYPE_F, GFX11_HW_IMM_TYPE_F },
[BRW_REGISTER_TYPE_HF] = { GFX11_HW_REG_TYPE_HF, GFX11_HW_IMM_TYPE_HF },
[BRW_REGISTER_TYPE_VF] = { INVALID, GFX11_HW_IMM_TYPE_VF },
[BRW_REGISTER_TYPE_D] = { GFX11_HW_REG_TYPE_D, GFX11_HW_IMM_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { GFX11_HW_REG_TYPE_UD, GFX11_HW_IMM_TYPE_UD },
[BRW_REGISTER_TYPE_W] = { GFX11_HW_REG_TYPE_W, GFX11_HW_IMM_TYPE_W },
[BRW_REGISTER_TYPE_UW] = { GFX11_HW_REG_TYPE_UW, GFX11_HW_IMM_TYPE_UW },
[BRW_REGISTER_TYPE_B] = { GFX11_HW_REG_TYPE_B, INVALID },
[BRW_REGISTER_TYPE_UB] = { GFX11_HW_REG_TYPE_UB, INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, GFX11_HW_IMM_TYPE_V },
[BRW_REGISTER_TYPE_UV] = { INVALID, GFX11_HW_IMM_TYPE_UV },
}, gfx12_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_FLOAT(2), GFX12_HW_REG_TYPE_FLOAT(2) },
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_FLOAT(1), GFX12_HW_REG_TYPE_FLOAT(1) },
[BRW_REGISTER_TYPE_VF] = { INVALID, GFX12_HW_REG_TYPE_FLOAT(0) },
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), GFX12_HW_REG_TYPE_SINT(2) },
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), GFX12_HW_REG_TYPE_UINT(2) },
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), GFX12_HW_REG_TYPE_SINT(1) },
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), GFX12_HW_REG_TYPE_UINT(1) },
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), INVALID },
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, GFX12_HW_REG_TYPE_SINT(0) },
[BRW_REGISTER_TYPE_UV] = { INVALID, GFX12_HW_REG_TYPE_UINT(0) },
}, gfx125_hw_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX12_HW_REG_TYPE_FLOAT(3), GFX12_HW_REG_TYPE_FLOAT(3) },
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_FLOAT(2), GFX12_HW_REG_TYPE_FLOAT(2) },
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_FLOAT(1), GFX12_HW_REG_TYPE_FLOAT(1) },
[BRW_REGISTER_TYPE_VF] = { INVALID, GFX12_HW_REG_TYPE_FLOAT(0) },
[BRW_REGISTER_TYPE_Q] = { GFX12_HW_REG_TYPE_SINT(3), GFX12_HW_REG_TYPE_SINT(3) },
[BRW_REGISTER_TYPE_UQ] = { GFX12_HW_REG_TYPE_UINT(3), GFX12_HW_REG_TYPE_UINT(3) },
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), GFX12_HW_REG_TYPE_SINT(2) },
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), GFX12_HW_REG_TYPE_UINT(2) },
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), GFX12_HW_REG_TYPE_SINT(1) },
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), GFX12_HW_REG_TYPE_UINT(1) },
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), INVALID },
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), INVALID },
[BRW_REGISTER_TYPE_V] = { INVALID, GFX12_HW_REG_TYPE_SINT(0) },
[BRW_REGISTER_TYPE_UV] = { INVALID, GFX12_HW_REG_TYPE_UINT(0) },
};
/* SNB adds 3-src instructions (MAD and LRP) that only operate on floats, so
* the types were implied. IVB adds BFE and BFI2 that operate on doublewords
* and unsigned doublewords, so a new field is also available in the da3src
* struct (part of struct brw_instruction.bits1 in brw_structs.h) to select
* dst and shared-src types.
*
* CNL adds support for 3-src instructions in align1 mode, and with it support
* for most register types.
*/
enum hw_3src_reg_type {
GFX7_3SRC_TYPE_F = 0,
GFX7_3SRC_TYPE_D = 1,
GFX7_3SRC_TYPE_UD = 2,
GFX7_3SRC_TYPE_DF = 3,
GFX8_3SRC_TYPE_HF = 4,
/** When ExecutionDatatype is 1: @{ */
GFX10_ALIGN1_3SRC_REG_TYPE_HF = 0b000,
GFX10_ALIGN1_3SRC_REG_TYPE_F = 0b001,
GFX10_ALIGN1_3SRC_REG_TYPE_DF = 0b010,
GFX11_ALIGN1_3SRC_REG_TYPE_NF = 0b011,
/** @} */
/** When ExecutionDatatype is 0: @{ */
GFX10_ALIGN1_3SRC_REG_TYPE_UD = 0b000,
GFX10_ALIGN1_3SRC_REG_TYPE_D = 0b001,
GFX10_ALIGN1_3SRC_REG_TYPE_UW = 0b010,
GFX10_ALIGN1_3SRC_REG_TYPE_W = 0b011,
GFX10_ALIGN1_3SRC_REG_TYPE_UB = 0b100,
GFX10_ALIGN1_3SRC_REG_TYPE_B = 0b101,
/** @} */
};
static const struct hw_3src_type {
enum hw_3src_reg_type reg_type;
enum gfx10_align1_3src_exec_type exec_type;
} gfx6_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_F] = { GFX7_3SRC_TYPE_F },
}, gfx7_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_F] = { GFX7_3SRC_TYPE_F },
[BRW_REGISTER_TYPE_D] = { GFX7_3SRC_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { GFX7_3SRC_TYPE_UD },
[BRW_REGISTER_TYPE_DF] = { GFX7_3SRC_TYPE_DF },
}, gfx8_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_F] = { GFX7_3SRC_TYPE_F },
[BRW_REGISTER_TYPE_D] = { GFX7_3SRC_TYPE_D },
[BRW_REGISTER_TYPE_UD] = { GFX7_3SRC_TYPE_UD },
[BRW_REGISTER_TYPE_DF] = { GFX7_3SRC_TYPE_DF },
[BRW_REGISTER_TYPE_HF] = { GFX8_3SRC_TYPE_HF },
}, gfx10_hw_3src_align1_type[] = {
#define E(x) BRW_ALIGN1_3SRC_EXEC_TYPE_##x
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX10_ALIGN1_3SRC_REG_TYPE_DF, E(FLOAT) },
[BRW_REGISTER_TYPE_F] = { GFX10_ALIGN1_3SRC_REG_TYPE_F, E(FLOAT) },
[BRW_REGISTER_TYPE_HF] = { GFX10_ALIGN1_3SRC_REG_TYPE_HF, E(FLOAT) },
[BRW_REGISTER_TYPE_D] = { GFX10_ALIGN1_3SRC_REG_TYPE_D, E(INT) },
[BRW_REGISTER_TYPE_UD] = { GFX10_ALIGN1_3SRC_REG_TYPE_UD, E(INT) },
[BRW_REGISTER_TYPE_W] = { GFX10_ALIGN1_3SRC_REG_TYPE_W, E(INT) },
[BRW_REGISTER_TYPE_UW] = { GFX10_ALIGN1_3SRC_REG_TYPE_UW, E(INT) },
[BRW_REGISTER_TYPE_B] = { GFX10_ALIGN1_3SRC_REG_TYPE_B, E(INT) },
[BRW_REGISTER_TYPE_UB] = { GFX10_ALIGN1_3SRC_REG_TYPE_UB, E(INT) },
}, gfx11_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_NF] = { GFX11_ALIGN1_3SRC_REG_TYPE_NF, E(FLOAT) },
[BRW_REGISTER_TYPE_F] = { GFX10_ALIGN1_3SRC_REG_TYPE_F, E(FLOAT) },
[BRW_REGISTER_TYPE_HF] = { GFX10_ALIGN1_3SRC_REG_TYPE_HF, E(FLOAT) },
[BRW_REGISTER_TYPE_D] = { GFX10_ALIGN1_3SRC_REG_TYPE_D, E(INT) },
[BRW_REGISTER_TYPE_UD] = { GFX10_ALIGN1_3SRC_REG_TYPE_UD, E(INT) },
[BRW_REGISTER_TYPE_W] = { GFX10_ALIGN1_3SRC_REG_TYPE_W, E(INT) },
[BRW_REGISTER_TYPE_UW] = { GFX10_ALIGN1_3SRC_REG_TYPE_UW, E(INT) },
[BRW_REGISTER_TYPE_B] = { GFX10_ALIGN1_3SRC_REG_TYPE_B, E(INT) },
[BRW_REGISTER_TYPE_UB] = { GFX10_ALIGN1_3SRC_REG_TYPE_UB, E(INT) },
}, gfx12_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_UINT(2), E(FLOAT), },
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_UINT(1), E(FLOAT), },
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), E(INT), },
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), E(INT), },
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), E(INT), },
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), E(INT), },
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), E(INT), },
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), E(INT), },
}, gfx125_hw_3src_type[] = {
[0 ... BRW_REGISTER_TYPE_LAST] = { INVALID },
[BRW_REGISTER_TYPE_DF] = { GFX12_HW_REG_TYPE_UINT(3), E(FLOAT), },
[BRW_REGISTER_TYPE_F] = { GFX12_HW_REG_TYPE_UINT(2), E(FLOAT), },
[BRW_REGISTER_TYPE_HF] = { GFX12_HW_REG_TYPE_UINT(1), E(FLOAT), },
[BRW_REGISTER_TYPE_Q] = { GFX12_HW_REG_TYPE_SINT(3), E(INT), },
[BRW_REGISTER_TYPE_UQ] = { GFX12_HW_REG_TYPE_UINT(3), E(INT), },
[BRW_REGISTER_TYPE_D] = { GFX12_HW_REG_TYPE_SINT(2), E(INT), },
[BRW_REGISTER_TYPE_UD] = { GFX12_HW_REG_TYPE_UINT(2), E(INT), },
[BRW_REGISTER_TYPE_W] = { GFX12_HW_REG_TYPE_SINT(1), E(INT), },
[BRW_REGISTER_TYPE_UW] = { GFX12_HW_REG_TYPE_UINT(1), E(INT), },
[BRW_REGISTER_TYPE_B] = { GFX12_HW_REG_TYPE_SINT(0), E(INT), },
[BRW_REGISTER_TYPE_UB] = { GFX12_HW_REG_TYPE_UINT(0), E(INT), },
#undef E
};
/**
* Convert a brw_reg_type enumeration value into the hardware representation.
*
* The hardware encoding may depend on whether the value is an immediate.
*/
unsigned
brw_reg_type_to_hw_type(const struct intel_device_info *devinfo,
enum brw_reg_file file,
enum brw_reg_type type)
{
const struct hw_type *table;
if (devinfo->verx10 >= 125) {
assert(type < ARRAY_SIZE(gfx125_hw_type));
table = gfx125_hw_type;
} else if (devinfo->ver >= 12) {
assert(type < ARRAY_SIZE(gfx12_hw_type));
table = gfx12_hw_type;
} else if (devinfo->ver >= 11) {
assert(type < ARRAY_SIZE(gfx11_hw_type));
table = gfx11_hw_type;
} else if (devinfo->ver >= 8) {
assert(type < ARRAY_SIZE(gfx8_hw_type));
table = gfx8_hw_type;
} else if (devinfo->ver >= 7) {
assert(type < ARRAY_SIZE(gfx7_hw_type));
table = gfx7_hw_type;
} else if (devinfo->ver >= 6) {
assert(type < ARRAY_SIZE(gfx6_hw_type));
table = gfx6_hw_type;
} else {
assert(type < ARRAY_SIZE(gfx4_hw_type));
table = gfx4_hw_type;
}
if (file == BRW_IMMEDIATE_VALUE) {
assert(table[type].imm_type != (enum hw_imm_type)INVALID);
return table[type].imm_type;
} else {
assert(table[type].reg_type != (enum hw_reg_type)INVALID);
return table[type].reg_type;
}
}
/**
* Convert the hardware representation into a brw_reg_type enumeration value.
*
* The hardware encoding may depend on whether the value is an immediate.
*/
enum brw_reg_type
brw_hw_type_to_reg_type(const struct intel_device_info *devinfo,
enum brw_reg_file file, unsigned hw_type)
{
const struct hw_type *table;
if (devinfo->verx10 >= 125) {
table = gfx125_hw_type;
} else if (devinfo->ver >= 12) {
table = gfx12_hw_type;
} else if (devinfo->ver >= 11) {
table = gfx11_hw_type;
} else if (devinfo->ver >= 8) {
table = gfx8_hw_type;
} else if (devinfo->ver >= 7) {
table = gfx7_hw_type;
} else if (devinfo->ver >= 6) {
table = gfx6_hw_type;
} else {
table = gfx4_hw_type;
}
if (file == BRW_IMMEDIATE_VALUE) {
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
if (table[i].imm_type == (enum hw_imm_type)hw_type) {
return i;
}
}
} else {
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
if (table[i].reg_type == (enum hw_reg_type)hw_type) {
return i;
}
}
}
return INVALID_REG_TYPE;
}
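/* A hedged sanity check (hypothetical test code, not part of this file):
 * for every type that has a valid encoding on a given platform, the two
 * conversions above should round-trip.
 *
 *    for (enum brw_reg_type t = 0; t <= BRW_REGISTER_TYPE_LAST; t++) {
 *       unsigned hw = brw_reg_type_to_hw_type(devinfo, file, t);
 *       assert(brw_hw_type_to_reg_type(devinfo, file, hw) == t);
 *    }
 *
 * (skipping types whose encoding is INVALID, since brw_reg_type_to_hw_type
 * asserts on those).
 */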
/**
* Convert a brw_reg_type enumeration value into the hardware representation
* for a 3-src align16 instruction
*/
unsigned
brw_reg_type_to_a16_hw_3src_type(const struct intel_device_info *devinfo,
enum brw_reg_type type)
{
const struct hw_3src_type *table;
if (devinfo->ver >= 8) {
assert(type < ARRAY_SIZE(gfx8_hw_3src_type));
table = gfx8_hw_3src_type;
} else if (devinfo->ver >= 7) {
assert(type < ARRAY_SIZE(gfx7_hw_3src_type));
table = gfx7_hw_3src_type;
} else {
assert(type < ARRAY_SIZE(gfx6_hw_3src_type));
table = gfx6_hw_3src_type;
}
assert(table[type].reg_type != (enum hw_3src_reg_type)INVALID);
return table[type].reg_type;
}
/**
* Convert a brw_reg_type enumeration value into the hardware representation
* for a 3-src align1 instruction
*/
unsigned
brw_reg_type_to_a1_hw_3src_type(const struct intel_device_info *devinfo,
enum brw_reg_type type)
{
if (devinfo->verx10 >= 125) {
assert(type < ARRAY_SIZE(gfx125_hw_3src_type));
return gfx125_hw_3src_type[type].reg_type;
} else if (devinfo->ver >= 12) {
assert(type < ARRAY_SIZE(gfx12_hw_3src_type));
return gfx12_hw_3src_type[type].reg_type;
} else if (devinfo->ver >= 11) {
assert(type < ARRAY_SIZE(gfx11_hw_3src_type));
return gfx11_hw_3src_type[type].reg_type;
} else {
assert(type < ARRAY_SIZE(gfx10_hw_3src_align1_type));
return gfx10_hw_3src_align1_type[type].reg_type;
}
}
/**
* Convert the hardware representation for a 3-src align16 instruction into a
* brw_reg_type enumeration value.
*/
enum brw_reg_type
brw_a16_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
unsigned hw_type)
{
const struct hw_3src_type *table = NULL;
if (devinfo->ver >= 8) {
table = gfx8_hw_3src_type;
} else if (devinfo->ver >= 7) {
table = gfx7_hw_3src_type;
} else if (devinfo->ver >= 6) {
table = gfx6_hw_3src_type;
}
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
if (table[i].reg_type == hw_type) {
return i;
}
}
return INVALID_REG_TYPE;
}
/**
* Convert the hardware representation for a 3-src align1 instruction into a
* brw_reg_type enumeration value.
*/
enum brw_reg_type
brw_a1_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
unsigned hw_type, unsigned exec_type)
{
const struct hw_3src_type *table =
(devinfo->verx10 >= 125 ? gfx125_hw_3src_type :
devinfo->ver >= 12 ? gfx12_hw_3src_type :
devinfo->ver >= 11 ? gfx11_hw_3src_type :
gfx10_hw_3src_align1_type);
for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) {
if (table[i].reg_type == hw_type &&
table[i].exec_type == exec_type) {
return i;
}
}
return INVALID_REG_TYPE;
}
/**
* Return the element size given a register type.
*/
unsigned
brw_reg_type_to_size(enum brw_reg_type type)
{
static const unsigned type_size[] = {
[BRW_REGISTER_TYPE_NF] = 8,
[BRW_REGISTER_TYPE_DF] = 8,
[BRW_REGISTER_TYPE_F] = 4,
[BRW_REGISTER_TYPE_HF] = 2,
[BRW_REGISTER_TYPE_VF] = 4,
[BRW_REGISTER_TYPE_Q] = 8,
[BRW_REGISTER_TYPE_UQ] = 8,
[BRW_REGISTER_TYPE_D] = 4,
[BRW_REGISTER_TYPE_UD] = 4,
[BRW_REGISTER_TYPE_W] = 2,
[BRW_REGISTER_TYPE_UW] = 2,
[BRW_REGISTER_TYPE_B] = 1,
[BRW_REGISTER_TYPE_UB] = 1,
[BRW_REGISTER_TYPE_V] = 2,
[BRW_REGISTER_TYPE_UV] = 2,
};
if (type >= ARRAY_SIZE(type_size))
return -1;
return type_size[type];
}
/**
* Converts a BRW_REGISTER_TYPE_* enum to a short string (F, UD, and so on).
*
* This is different than reg_encoding from brw_disasm.c in that it operates
* on the abstract enum values, rather than the generation-specific encoding.
*/
const char *
brw_reg_type_to_letters(enum brw_reg_type type)
{
static const char letters[][3] = {
[BRW_REGISTER_TYPE_NF] = "NF",
[BRW_REGISTER_TYPE_DF] = "DF",
[BRW_REGISTER_TYPE_F] = "F",
[BRW_REGISTER_TYPE_HF] = "HF",
[BRW_REGISTER_TYPE_VF] = "VF",
[BRW_REGISTER_TYPE_Q] = "Q",
[BRW_REGISTER_TYPE_UQ] = "UQ",
[BRW_REGISTER_TYPE_D] = "D",
[BRW_REGISTER_TYPE_UD] = "UD",
[BRW_REGISTER_TYPE_W] = "W",
[BRW_REGISTER_TYPE_UW] = "UW",
[BRW_REGISTER_TYPE_B] = "B",
[BRW_REGISTER_TYPE_UB] = "UB",
[BRW_REGISTER_TYPE_V] = "V",
[BRW_REGISTER_TYPE_UV] = "UV",
};
if (type >= ARRAY_SIZE(letters))
   return "INVALID";
return letters[type];
}

View file

@ -0,0 +1,209 @@
/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_REG_TYPE_H
#define BRW_REG_TYPE_H
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_PURE
#define ATTRIBUTE_PURE __attribute__((__pure__))
#else
#define ATTRIBUTE_PURE
#endif
enum brw_reg_file;
struct intel_device_info;
/*
* The ordering has been chosen so that no enum value is the same as a
* compatible hardware encoding.
*/
enum PACKED brw_reg_type {
/** Floating-point types: @{ */
BRW_REGISTER_TYPE_NF, /* >64-bit (accumulator-only) native float (gfx11+) */
BRW_REGISTER_TYPE_DF, /* 64-bit float (double float) */
BRW_REGISTER_TYPE_F, /* 32-bit float */
BRW_REGISTER_TYPE_HF, /* 16-bit float (half float) */
BRW_REGISTER_TYPE_VF, /* 32-bit vector of 4 8-bit floats */
/** @} */
/** Integer types: @{ */
BRW_REGISTER_TYPE_Q, /* 64-bit signed integer (quad word) */
BRW_REGISTER_TYPE_UQ, /* 64-bit unsigned integer (quad word) */
BRW_REGISTER_TYPE_D, /* 32-bit signed integer (double word) */
BRW_REGISTER_TYPE_UD, /* 32-bit unsigned integer (double word) */
BRW_REGISTER_TYPE_W, /* 16-bit signed integer (word) */
BRW_REGISTER_TYPE_UW, /* 16-bit unsigned integer (word) */
BRW_REGISTER_TYPE_B, /* 8-bit signed integer (byte) */
BRW_REGISTER_TYPE_UB, /* 8-bit unsigned integer (byte) */
BRW_REGISTER_TYPE_V, /* vector of 8 signed 4-bit integers (treated as W) */
BRW_REGISTER_TYPE_UV, /* vector of 8 unsigned 4-bit integers (treated as UW) */
/** @} */
BRW_REGISTER_TYPE_LAST = BRW_REGISTER_TYPE_UV
};
static inline bool
brw_reg_type_is_floating_point(enum brw_reg_type type)
{
switch (type) {
case BRW_REGISTER_TYPE_NF:
case BRW_REGISTER_TYPE_DF:
case BRW_REGISTER_TYPE_F:
case BRW_REGISTER_TYPE_HF:
return true;
default:
return false;
}
}
static inline bool
brw_reg_type_is_integer(enum brw_reg_type type)
{
switch (type) {
case BRW_REGISTER_TYPE_Q:
case BRW_REGISTER_TYPE_UQ:
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_UD:
case BRW_REGISTER_TYPE_W:
case BRW_REGISTER_TYPE_UW:
case BRW_REGISTER_TYPE_B:
case BRW_REGISTER_TYPE_UB:
return true;
default:
return false;
}
}
static inline bool
brw_reg_type_is_unsigned_integer(enum brw_reg_type tp)
{
return tp == BRW_REGISTER_TYPE_UB ||
tp == BRW_REGISTER_TYPE_UW ||
tp == BRW_REGISTER_TYPE_UD ||
tp == BRW_REGISTER_TYPE_UQ;
}
/*
* Returns a type based on a reference_type (word, float, half-float) and a
* given bit_size.
*/
static inline enum brw_reg_type
brw_reg_type_from_bit_size(unsigned bit_size,
enum brw_reg_type reference_type)
{
switch(reference_type) {
case BRW_REGISTER_TYPE_HF:
case BRW_REGISTER_TYPE_F:
case BRW_REGISTER_TYPE_DF:
switch(bit_size) {
case 16:
return BRW_REGISTER_TYPE_HF;
case 32:
return BRW_REGISTER_TYPE_F;
case 64:
return BRW_REGISTER_TYPE_DF;
default:
unreachable("Invalid bit size");
}
case BRW_REGISTER_TYPE_B:
case BRW_REGISTER_TYPE_W:
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_Q:
switch(bit_size) {
case 8:
return BRW_REGISTER_TYPE_B;
case 16:
return BRW_REGISTER_TYPE_W;
case 32:
return BRW_REGISTER_TYPE_D;
case 64:
return BRW_REGISTER_TYPE_Q;
default:
unreachable("Invalid bit size");
}
case BRW_REGISTER_TYPE_UB:
case BRW_REGISTER_TYPE_UW:
case BRW_REGISTER_TYPE_UD:
case BRW_REGISTER_TYPE_UQ:
switch(bit_size) {
case 8:
return BRW_REGISTER_TYPE_UB;
case 16:
return BRW_REGISTER_TYPE_UW;
case 32:
return BRW_REGISTER_TYPE_UD;
case 64:
return BRW_REGISTER_TYPE_UQ;
default:
unreachable("Invalid bit size");
}
default:
unreachable("Unknown type");
}
}
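/* Illustrative helper (added; not part of the original header, and the name
 * is hypothetical): the intended use of brw_reg_type_from_bit_size is to
 * resize a type within its class, e.g. (F, 16) -> HF, (D, 64) -> Q,
 * (UD, 8) -> UB.
 */
static inline enum brw_reg_type
brw_reg_type_with_size(enum brw_reg_type type, unsigned bit_size)
{
   return brw_reg_type_from_bit_size(bit_size, type);
}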
#define INVALID_REG_TYPE ((enum brw_reg_type)-1)
#define INVALID_HW_REG_TYPE ((unsigned)-1)
unsigned
brw_reg_type_to_hw_type(const struct intel_device_info *devinfo,
enum brw_reg_file file, enum brw_reg_type type);
enum brw_reg_type ATTRIBUTE_PURE
brw_hw_type_to_reg_type(const struct intel_device_info *devinfo,
enum brw_reg_file file, unsigned hw_type);
unsigned
brw_reg_type_to_a16_hw_3src_type(const struct intel_device_info *devinfo,
enum brw_reg_type type);
unsigned
brw_reg_type_to_a1_hw_3src_type(const struct intel_device_info *devinfo,
enum brw_reg_type type);
enum brw_reg_type
brw_a16_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
unsigned hw_type);
enum brw_reg_type
brw_a1_hw_3src_type_to_reg_type(const struct intel_device_info *devinfo,
unsigned hw_type, unsigned exec_type);
unsigned
brw_reg_type_to_size(enum brw_reg_type type);
const char *
brw_reg_type_to_letters(enum brw_reg_type type);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,292 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_RT_H
#define BRW_RT_H
#include <stdint.h>
#include "compiler/shader_enums.h"
#include "util/macros.h"
#ifdef __cplusplus
extern "C" {
#endif
/** Vulkan defines shaderGroupHandleSize = 32 */
#define BRW_RT_SBT_HANDLE_SIZE 32
/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
/** Offset after the RT dispatch globals at which "push" constants live */
#define BRW_RT_PUSH_CONST_OFFSET 128
/** Stride of the resume SBT */
#define BRW_BTD_RESUME_SBT_STRIDE 8
/* Vulkan always uses exactly two levels of BVH: world and object. At the API
* level, these are referred to as top and bottom.
*/
enum brw_rt_bvh_level {
BRW_RT_BVH_LEVEL_WORLD = 0,
BRW_RT_BVH_LEVEL_OBJECT = 1,
};
#define BRW_RT_MAX_BVH_LEVELS 2
enum brw_rt_bvh_node_type {
BRW_RT_BVH_NODE_TYPE_INTERNAL = 0,
BRW_RT_BVH_NODE_TYPE_INSTANCE = 1,
BRW_RT_BVH_NODE_TYPE_PROCEDURAL = 3,
BRW_RT_BVH_NODE_TYPE_QUAD = 4,
};
/** HitKind values returned for triangle geometry
*
* This enum must match the SPIR-V enum.
*/
enum brw_rt_hit_kind {
BRW_RT_HIT_KIND_FRONT_FACE = 0xfe,
BRW_RT_HIT_KIND_BACK_FACE = 0xff,
};
/** Ray flags
*
* This enum must match the SPIR-V RayFlags enum.
*/
enum brw_rt_ray_flags {
BRW_RT_RAY_FLAG_FORCE_OPAQUE = 0x01,
BRW_RT_RAY_FLAG_FORCE_NON_OPAQUE = 0x02,
BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT = 0x04,
BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER = 0x08,
BRW_RT_RAY_FLAG_CULL_BACK_FACING_TRIANGLES = 0x10,
BRW_RT_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 0x20,
BRW_RT_RAY_FLAG_CULL_OPAQUE = 0x40,
BRW_RT_RAY_FLAG_CULL_NON_OPAQUE = 0x80,
BRW_RT_RAY_FLAG_SKIP_TRIANGLES = 0x100,
BRW_RT_RAY_FLAG_SKIP_AABBS = 0x200,
};
struct brw_rt_scratch_layout {
/** Number of stack IDs per DSS */
uint32_t stack_ids_per_dss;
/** Start offset (in bytes) of the hardware MemRay stack */
uint32_t ray_stack_start;
/** Stride (in bytes) of the hardware MemRay stack */
uint32_t ray_stack_stride;
/** Start offset (in bytes) of the SW stacks */
uint64_t sw_stack_start;
/** Size (in bytes) of the SW stack for a single shader invocation */
uint32_t sw_stack_size;
/** Total size (in bytes) of the RT scratch memory area */
uint64_t total_size;
};
/** Parameters passed to the raygen trampoline shader
*
 * This struct is carefully constructed to be 32B and must be passed to the
 * raygen trampoline shader as inline constant data.
*/
struct brw_rt_raygen_trampoline_params {
/** The GPU address of the RT_DISPATCH_GLOBALS */
uint64_t rt_disp_globals_addr;
/** The GPU address of the BINDLESS_SHADER_RECORD for the raygen shader */
uint64_t raygen_bsr_addr;
/** 1 if this is an indirect dispatch, 0 otherwise */
uint8_t is_indirect;
/** The integer log2 of the local group size
*
* Ray-tracing shaders don't have a concept of local vs. global workgroup
* size. They only have a single 3D launch size. The raygen trampoline
* shader is always dispatched with a local workgroup size equal to the
* SIMD width but the shape of the local workgroup is determined at
* dispatch time based on the shape of the launch and passed to the
* trampoline via this field. (There's no sense having a Z dimension on
* the local workgroup if the launch is 2D.)
*
* We use the integer log2 of the size because there's no point in
* non-power-of-two sizes and shifts are cheaper than division.
*/
uint8_t local_group_size_log2[3];
uint32_t pad[3];
};
/** Size of the "hot zone" in bytes
*
* The hot zone is a SW-defined data structure which is a single uvec4
 * containing two pieces of information:
*
* - hotzone.x: Stack offset (in bytes)
*
* This is the offset (in bytes) into the per-thread scratch space at which
* the current shader's stack starts. This is incremented by the calling
* shader prior to any shader call type instructions and gets decremented
* by the resume shader as part of completing the return operation.
*
*
* - hotzone.yzw: The launch ID associated with the current thread
*
* Inside a bindless shader, the only information we have is the DSS ID
* from the hardware EU and a per-DSS stack ID. In particular, the three-
* dimensional launch ID is lost the moment we leave the raygen trampoline.
*/
#define BRW_RT_SIZEOF_HOTZONE 16
/* From the BSpec "Address Computation for Memory Based Data Structures:
* Ray and TraversalStack (Async Ray Tracing)":
*
* sizeof(Ray) = 64B, sizeof(HitInfo) = 32B, sizeof(TravStack) = 32B.
*/
#define BRW_RT_SIZEOF_RAY 64
#define BRW_RT_SIZEOF_HIT_INFO 32
#define BRW_RT_SIZEOF_TRAV_STACK 32
/* From the BSpec:
*
* syncStackSize = (maxBVHLevels % 2 == 1) ?
* (sizeof(HitInfo) * 2 +
* (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels + 32B) :
* (sizeof(HitInfo) * 2 +
* (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels);
*
* The select is just to align to 64B.
*/
#define BRW_RT_SIZEOF_RAY_QUERY \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
(BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
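/* Worked numbers (added for illustration): with BRW_RT_MAX_BVH_LEVELS == 2
 * the odd-level padding term drops out, so both query sizes evaluate to
 * 2 * 32 + 2 * (64 + 32) = 256B, which is already 64B-aligned.
 */
_Static_assert(BRW_RT_SIZEOF_RAY_QUERY == 256, "2*32 + 2*(64+32)");
_Static_assert(BRW_RT_SIZEOF_SHADOW_RAY_QUERY == 256, "2*32 + 2*(64+32)");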
#define BRW_RT_SIZEOF_HW_STACK \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
BRW_RT_SIZEOF_TRAV_STACK * BRW_RT_MAX_BVH_LEVELS)
/* This is a mesa-defined region for hit attribute data */
#define BRW_RT_SIZEOF_HIT_ATTRIB_DATA 64
#define BRW_RT_OFFSETOF_HIT_ATTRIB_DATA BRW_RT_SIZEOF_HW_STACK
#define BRW_RT_ASYNC_STACK_STRIDE \
ALIGN_POT(BRW_RT_OFFSETOF_HIT_ATTRIB_DATA + \
BRW_RT_SIZEOF_HIT_ATTRIB_DATA, 64)
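/* Worked numbers (added for illustration): 256B of HW stack plus 64B of hit
 * attribute data is 320B, already a multiple of 64, so the ALIGN_POT is a
 * no-op here.
 */
_Static_assert(BRW_RT_ASYNC_STACK_STRIDE == 320, "256 + 64, 64B-aligned");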
static inline void
brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
const struct intel_device_info *devinfo,
uint32_t stack_ids_per_dss,
uint32_t sw_stack_size)
{
layout->stack_ids_per_dss = stack_ids_per_dss;
const uint32_t dss_count = intel_device_info_dual_subslice_id_bound(devinfo);
const uint32_t num_stack_ids = dss_count * stack_ids_per_dss;
uint64_t size = 0;
/* The first thing in our scratch area is an array of "hot zones" which
* store the stack offset as well as the launch IDs for each active
* invocation.
*/
size += BRW_RT_SIZEOF_HOTZONE * num_stack_ids;
/* Next, we place the HW ray stacks */
assert(size % 64 == 0); /* Cache-line aligned */
assert(size < UINT32_MAX);
layout->ray_stack_start = size;
layout->ray_stack_stride = BRW_RT_ASYNC_STACK_STRIDE;
size += num_stack_ids * layout->ray_stack_stride;
/* Finally, we place the SW stacks for the individual ray-tracing shader
* invocations. We align these to 64B to ensure that we don't have any
* shared cache lines which could hurt performance.
*/
assert(size % 64 == 0);
layout->sw_stack_start = size;
layout->sw_stack_size = ALIGN(sw_stack_size, 64);
/* Currently it's always the case that sw_stack_size is a power of
* two, but power-of-two SW stack sizes are prone to causing
* collisions in the hashing function used by the L3 to map memory
* addresses to banks, which can cause stack accesses from most
* DSSes to bottleneck on a single L3 bank. Fix it by padding the
* SW stack by a single cacheline if it was a power of two.
*/
if (layout->sw_stack_size > 64 &&
util_is_power_of_two_nonzero(layout->sw_stack_size))
layout->sw_stack_size += 64;
size += num_stack_ids * layout->sw_stack_size;
layout->total_size = size;
}
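/* Usage sketch (illustrative; the parameter values are made up): size an RT
 * scratch buffer for 2048 stack IDs per DSS and an 8KB SW stack per
 * invocation.
 */
static inline uint64_t
brw_rt_example_scratch_size(const struct intel_device_info *devinfo)
{
   struct brw_rt_scratch_layout layout;
   brw_rt_compute_scratch_layout(&layout, devinfo,
                                 2048 /* stack_ids_per_dss */,
                                 8192 /* sw_stack_size */);
   return layout.total_size;
}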
static inline uint32_t
brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
{
/* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
 * which accounts for all threads.
 */
uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
return max_eu_id * max_simd_size * BRW_RT_SIZEOF_RAY_QUERY;
}
static inline uint32_t
brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
{
/* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
 * which accounts for all threads.
 */
uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
}
static inline uint32_t
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
uint32_t ray_queries)
{
/* Don't bother with a shadow stack if we only have a single query; in that
 * case we can write directly to the HW buffer.
 */
return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
ray_queries * 4; /* Ctrl + Level data */
}
#ifdef __cplusplus
}
#endif
#endif /* BRW_RT_H */

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,196 @@
/*
* Copyright © 2010 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_SHADER_H
#define BRW_SHADER_H
#include <stdint.h>
#include "brw_cfg.h"
#include "brw_compiler.h"
#include "compiler/nir/nir.h"
#ifdef __cplusplus
#include "brw_ir_analysis.h"
#include "brw_ir_allocator.h"
enum instruction_scheduler_mode {
SCHEDULE_PRE,
SCHEDULE_PRE_NON_LIFO,
SCHEDULE_PRE_LIFO,
SCHEDULE_POST,
SCHEDULE_NONE,
};
#define UBO_START ((1 << 16) - 4)
struct backend_shader {
protected:
backend_shader(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const nir_shader *shader,
struct brw_stage_prog_data *stage_prog_data,
bool debug_enabled);
public:
virtual ~backend_shader();
const struct brw_compiler *compiler;
void *log_data; /* Passed to compiler->*_log functions */
const struct intel_device_info * const devinfo;
const nir_shader *nir;
struct brw_stage_prog_data * const stage_prog_data;
/** ralloc context for temporary data used during compile */
void *mem_ctx;
/**
* List of either fs_inst or vec4_instruction (inheriting from
* backend_instruction)
*/
exec_list instructions;
cfg_t *cfg;
brw_analysis<brw::idom_tree, backend_shader> idom_analysis;
gl_shader_stage stage;
bool debug_enabled;
brw::simple_allocator alloc;
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const = 0;
virtual void dump_instructions_to_file(FILE *file) const;
/* Convenience functions based on the above. */
void dump_instruction(const backend_instruction *inst, FILE *file = stderr) const {
dump_instruction_to_file(inst, file);
}
void dump_instructions(const char *name = nullptr) const;
void calculate_cfg();
virtual void invalidate_analysis(brw::analysis_dependency_class c);
};
#else
struct backend_shader;
#endif /* __cplusplus */
enum brw_reg_type brw_type_for_base_type(const struct glsl_type *type);
uint32_t brw_math_function(enum opcode op);
const char *brw_instruction_name(const struct brw_isa_info *isa,
enum opcode op);
bool brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg);
bool brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg);
bool brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg);
bool opt_predicated_break(struct backend_shader *s);
#ifdef __cplusplus
extern "C" {
#endif
/* brw_fs_reg_allocate.cpp */
void brw_fs_alloc_reg_sets(struct brw_compiler *compiler);
/* brw_vec4_reg_allocate.cpp */
void brw_vec4_alloc_reg_set(struct brw_compiler *compiler);
/* brw_disasm.c */
extern const char *const conditional_modifier[16];
extern const char *const pred_ctrl_align16[16];
/* Per-thread scratch space is a power-of-two multiple of 1KB. */
static inline unsigned
brw_get_scratch_size(int size)
{
return MAX2(1024, util_next_power_of_two(size));
}
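/* Examples (added for illustration): brw_get_scratch_size(5000) rounds up to
 * 8192, while brw_get_scratch_size(12) hits the 1KB floor and returns 1024.
 */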
static inline nir_variable_mode
brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
gl_shader_stage stage)
{
const struct intel_device_info *devinfo = compiler->devinfo;
const bool is_scalar = compiler->scalar_stage[stage];
nir_variable_mode indirect_mask = (nir_variable_mode) 0;
switch (stage) {
case MESA_SHADER_VERTEX:
case MESA_SHADER_FRAGMENT:
indirect_mask |= nir_var_shader_in;
break;
case MESA_SHADER_GEOMETRY:
if (!is_scalar)
indirect_mask |= nir_var_shader_in;
break;
default:
/* Everything else can handle indirect inputs */
break;
}
if (is_scalar && stage != MESA_SHADER_TESS_CTRL &&
stage != MESA_SHADER_TASK &&
stage != MESA_SHADER_MESH)
indirect_mask |= nir_var_shader_out;
/* On HSW+, we allow indirects in scalar shaders. They get implemented
* using nir_lower_vars_to_explicit_types and nir_lower_explicit_io in
* brw_postprocess_nir.
*
* We haven't plumbed through the indirect scratch messages on gfx6 or
* earlier so doing indirects via scratch doesn't work there. On gfx7 and
* earlier the scratch space size is limited to 12kB. If we allowed
* indirects as scratch all the time, we may easily exceed this limit
* without having any fallback.
*/
if (is_scalar && devinfo->verx10 <= 70)
indirect_mask |= nir_var_function_temp;
return indirect_mask;
}
bool brw_texture_offset(const nir_tex_instr *tex, unsigned src,
uint32_t *offset_bits);
/**
* Scratch data used when compiling a GLSL geometry shader.
*/
struct brw_gs_compile
{
struct brw_gs_prog_key key;
struct intel_vue_map input_vue_map;
unsigned control_data_bits_per_vertex;
unsigned control_data_header_size_bits;
};
#ifdef __cplusplus
}
#endif
#endif /* BRW_SHADER_H */

View file

@ -0,0 +1,268 @@
/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_private.h"
#include "compiler/shader_info.h"
#include "intel/dev/intel_debug.h"
#include "intel/dev/intel_device_info.h"
#include "util/ralloc.h"
unsigned
brw_required_dispatch_width(const struct shader_info *info)
{
if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
assert(gl_shader_stage_uses_workgroup(info->stage));
/* These enum values are expressly chosen to be equal to the subgroup
* size that they require.
*/
return (unsigned)info->subgroup_size;
} else {
return 0;
}
}
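/* Added checks (illustrative): the comment above relies on the
 * SUBGROUP_SIZE_REQUIRE_* enum values being numerically equal to the widths
 * they require.
 */
static_assert((int)SUBGROUP_SIZE_REQUIRE_8 == 8);
static_assert((int)SUBGROUP_SIZE_REQUIRE_16 == 16);
static_assert((int)SUBGROUP_SIZE_REQUIRE_32 == 32);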
static inline bool
test_bit(unsigned mask, unsigned bit) {
return mask & (1u << bit);
}
namespace {
struct brw_cs_prog_data *
get_cs_prog_data(brw_simd_selection_state &state)
{
if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
return std::get<struct brw_cs_prog_data *>(state.prog_data);
else
return nullptr;
}
struct brw_stage_prog_data *
get_prog_data(brw_simd_selection_state &state)
{
if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
return &std::get<struct brw_cs_prog_data *>(state.prog_data)->base;
else if (std::holds_alternative<struct brw_bs_prog_data *>(state.prog_data))
return &std::get<struct brw_bs_prog_data *>(state.prog_data)->base;
else
return nullptr;
}
}
bool
brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
{
assert(simd < SIMD_COUNT);
assert(!state.compiled[simd]);
const auto cs_prog_data = get_cs_prog_data(state);
const auto prog_data = get_prog_data(state);
const unsigned width = 8u << simd;
/* For shaders with variable size workgroup, in most cases we can compile
* all the variants (exceptions are bindless dispatch & ray queries), since
* the choice will happen only at dispatch time.
*/
const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;
if (!workgroup_size_variable) {
if (state.spilled[simd]) {
state.error[simd] = "Would spill";
return false;
}
if (state.required_width && state.required_width != width) {
state.error[simd] = "Different than required dispatch width";
return false;
}
if (cs_prog_data) {
const unsigned workgroup_size = cs_prog_data->local_size[0] *
cs_prog_data->local_size[1] *
cs_prog_data->local_size[2];
unsigned max_threads = state.devinfo->max_cs_workgroup_threads;
const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
if (simd > min_simd && state.compiled[simd - 1] &&
workgroup_size <= (width / 2)) {
state.error[simd] = "Workgroup size already fits in smaller SIMD";
return false;
}
if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
state.error[simd] = "Would need more than max_threads to fit all invocations";
return false;
}
}
/* SIMD32 is only enabled when it is actually needed, unless forced.
*
* TODO: Use performance_analysis and drop this rule.
*/
if (width == 32 && state.devinfo->ver < 20) {
if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) {
state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)";
return false;
}
}
}
if (width == 8 && state.devinfo->ver >= 20) {
state.error[simd] = "SIMD8 not supported on Xe2+";
return false;
}
if (width == 32 && cs_prog_data && cs_prog_data->base.ray_queries > 0) {
state.error[simd] = "Ray queries not supported";
return false;
}
if (width == 32 && cs_prog_data && cs_prog_data->uses_btd_stack_ids) {
state.error[simd] = "Bindless shader calls not supported";
return false;
}
uint64_t start;
switch (prog_data->stage) {
case MESA_SHADER_COMPUTE:
start = DEBUG_CS_SIMD8;
break;
case MESA_SHADER_TASK:
start = DEBUG_TS_SIMD8;
break;
case MESA_SHADER_MESH:
start = DEBUG_MS_SIMD8;
break;
case MESA_SHADER_RAYGEN:
case MESA_SHADER_ANY_HIT:
case MESA_SHADER_CLOSEST_HIT:
case MESA_SHADER_MISS:
case MESA_SHADER_INTERSECTION:
case MESA_SHADER_CALLABLE:
start = DEBUG_RT_SIMD8;
break;
default:
unreachable("unknown shader stage in brw_simd_should_compile");
}
const bool env_skip[] = {
(intel_simd & (start << 0)) == 0,
(intel_simd & (start << 1)) == 0,
(intel_simd & (start << 2)) == 0,
};
static_assert(ARRAY_SIZE(env_skip) == SIMD_COUNT);
if (unlikely(env_skip[simd])) {
state.error[simd] = "Disabled by INTEL_DEBUG environment variable";
return false;
}
return true;
}
void
brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled)
{
assert(simd < SIMD_COUNT);
assert(!state.compiled[simd]);
auto cs_prog_data = get_cs_prog_data(state);
state.compiled[simd] = true;
if (cs_prog_data)
cs_prog_data->prog_mask |= 1u << simd;
/* If a SIMD spilled, all the larger ones would spill too. */
if (spilled) {
for (unsigned i = simd; i < SIMD_COUNT; i++) {
state.spilled[i] = true;
if (cs_prog_data)
cs_prog_data->prog_spilled |= 1u << i;
}
}
}
int
brw_simd_select(const struct brw_simd_selection_state &state)
{
for (int i = SIMD_COUNT - 1; i >= 0; i--) {
if (state.compiled[i] && !state.spilled[i])
return i;
}
for (int i = SIMD_COUNT - 1; i >= 0; i--) {
if (state.compiled[i])
return i;
}
return -1;
}
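/* Usage sketch (illustrative only; not part of the original file): the
 * compile loop this API is built around -- test each width, compile, mark
 * the result, then pick the widest viable variant. The compile step itself
 * is elided.
 */
static inline int
example_compile_and_select(brw_simd_selection_state &state)
{
   for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
      if (brw_simd_should_compile(state, simd)) {
         const bool spilled = false; /* would come from the real compile */
         brw_simd_mark_compiled(state, simd, spilled);
      }
   }
   return brw_simd_select(state);
}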
int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
const struct brw_cs_prog_data *prog_data,
const unsigned *sizes)
{
if (!sizes || (prog_data->local_size[0] == sizes[0] &&
prog_data->local_size[1] == sizes[1] &&
prog_data->local_size[2] == sizes[2])) {
brw_simd_selection_state simd_state{
.prog_data = const_cast<struct brw_cs_prog_data *>(prog_data),
};
/* Propagate the prog_data information back to the simd_state,
* so we can use select() directly.
*/
for (int i = 0; i < SIMD_COUNT; i++) {
simd_state.compiled[i] = test_bit(prog_data->prog_mask, i);
simd_state.spilled[i] = test_bit(prog_data->prog_spilled, i);
}
return brw_simd_select(simd_state);
}
struct brw_cs_prog_data cloned = *prog_data;
for (unsigned i = 0; i < 3; i++)
cloned.local_size[i] = sizes[i];
cloned.prog_mask = 0;
cloned.prog_spilled = 0;
brw_simd_selection_state simd_state{
.devinfo = devinfo,
.prog_data = &cloned,
};
for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
/* We are not recompiling, so use original results of prog_mask and
* prog_spilled as they will already contain all possible compilations.
*/
if (brw_simd_should_compile(simd_state, simd) &&
test_bit(prog_data->prog_mask, simd)) {
brw_simd_mark_compiled(simd_state, simd, test_bit(prog_data->prog_spilled, simd));
}
}
return brw_simd_select(simd_state);
}
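/* Usage sketch (illustrative; 64x1x1 is a made-up dispatch size): at dispatch
 * time, a driver with a variable-workgroup-size shader picks the variant for
 * the actual dimensions like this.
 */
static inline int
example_select_for_dispatch(const struct intel_device_info *devinfo,
                            const struct brw_cs_prog_data *prog_data)
{
   const unsigned sizes[3] = { 64, 1, 1 };
   return brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
}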

Some files were not shown because too many files have changed in this diff