From 7a521ddf36b3d1f7b81604341fcda26c60809c9b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 11 Jan 2016 16:39:15 -0500
Subject: [PATCH 001/119] nvc0: allow fragment shader inputs to use indirect
 indexing

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index ccf96fb2815..6f7defa35f2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -295,9 +295,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS)
          return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE;
       return NVC0_MAX_PIPE_CONSTBUFS;
-   case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
    case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
       return shader != PIPE_SHADER_FRAGMENT;
+   case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
    case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
    case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
       return 1;

From 37b67db6ae34fb6586d640a7a1b6232f091dd812 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 11 Jan 2016 16:41:18 -0500
Subject: [PATCH 002/119] nvc0/ir: be careful about propagating very large
 offsets into const load

Indirect constbuf indexing works by using very large offsets. However if
an indirect constbuf index load is const-propagated, it becomes a very
large const offset. Take that into account when legalizing the SSA by
moving the high parts of that offset into the file index. Also disallow
very large (or small) indices on most other instructions.

This fixes regressions in ubo_array_indexing/*-two-arrays piglit tests.

Fixes: abd326e81b (nv50/ir: propagate indirect loads into instructions)
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp  |  6 ++++++
 src/gallium/drivers/nouveau/codegen/nv50_ir_target.h   |  2 +-
 .../drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp    | 10 ++++++++++
 .../drivers/nouveau/codegen/nv50_ir_target_nvc0.h      |  2 ++
 4 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 6530078b938..dc1ab769b98 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -540,6 +540,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
          // It seems like barriers are never required for tessellation since
          // the warp size is 32, and there are always at most 32 tcs threads.
          bb->remove(i);
+      } else
+      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
+         int offset = i->src(0).get()->reg.data.offset;
+         if (abs(offset) > 0x10000)
+            i->src(0).get()->reg.fileIndex += offset >> 16;
+         i->src(0).get()->reg.data.offset = (int)(short)offset;
       } else {
          // TODO: Move this to before register allocation for operations that
          // need the $c register !
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
index 673f8811ff3..e6e1912adae 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
@@ -192,7 +192,7 @@ public:
    virtual bool insnCanLoad(const Instruction *insn, int s,
                             const Instruction *ld) const = 0;
    virtual bool insnCanLoadOffset(const Instruction *insn, int s,
-                                  int offset) const { return true; }
+                                  int offset) const = 0;
    virtual bool isOpSupported(operation, DataType) const = 0;
    virtual bool isAccessSupported(DataFile, DataType) const = 0;
    virtual bool isModSupported(const Instruction *,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 014c652eede..a03afa8dc8d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -383,6 +383,16 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
    return true;
 }
 
+bool
+TargetNVC0::insnCanLoadOffset(const Instruction *insn, int s, int offset) const
+{
+   const ValueRef& ref = insn->src(s);
+   if (ref.getFile() == FILE_MEMORY_CONST &&
+       (insn->op != OP_LOAD || insn->subOp != NV50_IR_SUBOP_LDC_IS))
+      return offset >= -0x8000 && offset < 0x8000;
+   return true;
+}
+
 bool
 TargetNVC0::isAccessSupported(DataFile file, DataType ty) const
 {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
index 3c5c7480405..7d11cd96315 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
@@ -48,6 +48,8 @@ public:
 
    virtual bool insnCanLoad(const Instruction *insn, int s,
                             const Instruction *ld) const;
+   virtual bool insnCanLoadOffset(const Instruction *insn, int s,
+                                  int offset) const;
    virtual bool isOpSupported(operation, DataType) const;
    virtual bool isAccessSupported(DataFile, DataType) const;
    virtual bool isModSupported(const Instruction *, int s, Modifier) const;

From e231f59b6d5b12035a8041305c3a732d39a39c19 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 12 Jan 2016 14:41:52 -0500
Subject: [PATCH 003/119] nv50/ir: only use FILE_LOCAL_MEMORY for temp arrays
 that use indirection

Previously we were treating any indirect temp array usage to mean that
everything should end up in lmem. The MemoryOpt pass would clean a lot
of that up later, but in the meanwhile we would lose a lot of
opportunity for optimization.

This helps a lot of Metro 2033 Redux and a handful of KSP shaders:

total instructions in shared programs : 6288373 -> 6261517 (-0.43%)
total gprs used in shared programs    : 944051 -> 945131 (0.11%)
total local used in shared programs   : 54116 -> 54116 (0.00%)

A typical case is for register usage to double and for instructions to
halve. A future commit can also optimize local memory usage size to be
reduced with better packing.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     | 65 ++++++++++++++-----
 1 file changed, 50 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 7b313f3c39c..5454f042078 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -95,6 +95,13 @@ public:
          return tgsi_util_get_src_register_swizzle(&reg, chan);
       }
 
+      int getArrayId() const
+      {
+         if (isIndirect(0))
+            return fsr->Indirect.ArrayID;
+         return 0;
+      }
+
       nv50_ir::Modifier getMod(int chan) const;
 
       SrcRegister getIndirect(int dim) const
@@ -154,6 +161,13 @@ public:
          return SrcRegister(fdr->Indirect);
       }
 
+      int getArrayId() const
+      {
+         if (isIndirect(0))
+            return fdr->Indirect.ArrayID;
+         return 0;
+      }
+
    private:
       const struct tgsi_dst_register reg;
       const struct tgsi_full_dst_register *fdr;
@@ -809,7 +823,8 @@ public:
    // these registers are per-subroutine, cannot be used for parameter passing
    std::set<Location> locals;
 
-   bool mainTempsInLMem;
+   std::set<int> indirectTempArrays;
+   std::vector<int> tempArrayId;
 
    int clipVertexOutput;
 
@@ -841,8 +856,6 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog)
 
    if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
       tgsi_dump(tokens, 0);
-
-   mainTempsInLMem = false;
 }
 
 Source::~Source()
@@ -872,6 +885,7 @@ bool Source::scanSource()
 
    textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1);
    //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
+   tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1);
 
    info->immd.bufSize = 0;
 
@@ -917,7 +931,8 @@ bool Source::scanSource()
    }
    tgsi_parse_free(&parse);
 
-   if (mainTempsInLMem)
+   // TODO: Compute based on relevant array sizes
+   if (indirectTempArrays.size())
       info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16;
 
    if (info->io.genUserClip > 0) {
@@ -1028,6 +1043,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
    unsigned sn = TGSI_SEMANTIC_GENERIC;
    unsigned si = 0;
    const unsigned first = decl->Range.First, last = decl->Range.Last;
+   const int arrayId = decl->Array.ArrayID;
 
    if (decl->Declaration.Semantic) {
       sn = decl->Semantic.Name;
@@ -1172,8 +1188,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
       for (i = first; i <= last; ++i)
          textureViews[i].target = decl->SamplerView.Resource;
       break;
-   case TGSI_FILE_NULL:
    case TGSI_FILE_TEMPORARY:
+      for (i = first; i <= last; ++i)
+         tempArrayId[i] = arrayId;
+      break;
+   case TGSI_FILE_NULL:
    case TGSI_FILE_ADDRESS:
    case TGSI_FILE_CONSTANT:
    case TGSI_FILE_IMMEDIATE:
@@ -1223,7 +1242,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
       } else
       if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
          if (insn.getDst(0).isIndirect(0))
-            mainTempsInLMem = true;
+            indirectTempArrays.insert(insn.getDst(0).getArrayId());
       }
    }
 
@@ -1231,7 +1250,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
       Instruction::SrcRegister src = insn.getSrc(s);
       if (src.getFile() == TGSI_FILE_TEMPORARY) {
          if (src.isIndirect(0))
-            mainTempsInLMem = true;
+            indirectTempArrays.insert(src.getArrayId());
       } else
 /*
       if (src.getFile() == TGSI_FILE_RESOURCE) {
@@ -1416,6 +1435,7 @@ private:
    DataType srcTy;
 
    DataArray tData; // TGSI_FILE_TEMPORARY
+   DataArray lData; // TGSI_FILE_TEMPORARY, for indirect arrays
    DataArray aData; // TGSI_FILE_ADDRESS
    DataArray pData; // TGSI_FILE_PREDICATE
    DataArray oData; // TGSI_FILE_OUTPUT (if outputs in registers)
@@ -1619,7 +1639,7 @@ Converter::getArrayForFile(unsigned file, int idx)
 {
    switch (file) {
    case TGSI_FILE_TEMPORARY:
-      return &tData;
+      return idx == 0 ? &tData : &lData;
    case TGSI_FILE_PREDICATE:
       return &pData;
    case TGSI_FILE_ADDRESS:
@@ -1644,7 +1664,7 @@ Converter::shiftAddress(Value *index)
 Value *
 Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
 {
-   const int idx2d = src.is2D() ? src.getIndex(1) : 0;
+   int idx2d = src.is2D() ? src.getIndex(1) : 0;
    const int idx = src.getIndex(0);
    const int swz = src.getSwizzle(c);
    Instruction *ld;
@@ -1686,6 +1706,14 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
       ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
       ld->perPatch = info->sv[idx].patch;
       return ld->getDef(0);
+   case TGSI_FILE_TEMPORARY: {
+      int arrayid = src.getArrayId();
+      if (!arrayid)
+         arrayid = code->tempArrayId[idx];
+      idx2d = (code->indirectTempArrays.find(arrayid) !=
+               code->indirectTempArrays.end());
+   }
+      /* fallthrough */
    default:
       return getArrayForFile(src.getFile(), idx2d)->load(
          sub.cur->values, idx, swz, shiftAddress(ptr));
@@ -1698,7 +1726,7 @@ Converter::acquireDst(int d, int c)
    const tgsi::Instruction::DstRegister dst = tgsi.getDst(d);
    const unsigned f = dst.getFile();
    const int idx = dst.getIndex(0);
-   const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
+   int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
 
    if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/)
       return NULL;
@@ -1708,6 +1736,10 @@ Converter::acquireDst(int d, int c)
        (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT))
       return getScratch();
 
+   if (f == TGSI_FILE_TEMPORARY)
+      idx2d = code->indirectTempArrays.find(code->tempArrayId[idx]) !=
+         code->indirectTempArrays.end();
+
    return getArrayForFile(f, idx2d)-> acquire(sub.cur->values, idx, c);
 }
 
@@ -1740,7 +1772,7 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
 {
    const unsigned f = dst.getFile();
    const int idx = dst.getIndex(0);
-   const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
+   int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
 
    if (f == TGSI_FILE_SYSTEM_VALUE) {
       assert(!ptr);
@@ -1763,6 +1795,10 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
        f == TGSI_FILE_PREDICATE ||
        f == TGSI_FILE_ADDRESS ||
        f == TGSI_FILE_OUTPUT) {
+      if (f == TGSI_FILE_TEMPORARY)
+         idx2d = code->indirectTempArrays.find(code->tempArrayId[idx]) !=
+            code->indirectTempArrays.end();
+
       getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val);
    } else {
       assert(!"invalid dst file");
@@ -3326,18 +3362,17 @@ Converter::exportOutputs()
 Converter::Converter(Program *ir, const tgsi::Source *code) : BuildUtil(ir),
      code(code),
      tgsi(NULL),
-     tData(this), aData(this), pData(this), oData(this)
+     tData(this), lData(this), aData(this), pData(this), oData(this)
 {
    info = code->info;
 
-   const DataFile tFile = code->mainTempsInLMem ? FILE_MEMORY_LOCAL : FILE_GPR;
-
    const unsigned tSize = code->fileSize(TGSI_FILE_TEMPORARY);
    const unsigned pSize = code->fileSize(TGSI_FILE_PREDICATE);
    const unsigned aSize = code->fileSize(TGSI_FILE_ADDRESS);
    const unsigned oSize = code->fileSize(TGSI_FILE_OUTPUT);
 
-   tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, tFile, 0);
+   tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, FILE_GPR, 0);
+   lData.setup(TGSI_FILE_TEMPORARY, 1, 0, tSize, 4, 4, FILE_MEMORY_LOCAL, 0);
    pData.setup(TGSI_FILE_PREDICATE, 0, 0, pSize, 4, 4, FILE_PREDICATE, 0);
    aData.setup(TGSI_FILE_ADDRESS, 0, 0, aSize, 4, 4, FILE_GPR, 0);
    oData.setup(TGSI_FILE_OUTPUT, 0, 0, oSize, 4, 4, FILE_GPR, 0);

From fffb559129dd1ae978ec7f9ba30b4ae97a5ebbcc Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 14 Jan 2016 01:09:25 -0500
Subject: [PATCH 004/119] nv50/ir: rebase indirect temp arrays to 0, so that we
 use less lmem space

Reduces local memory usage in a lot of Metro 2033 Redux and a few KSP
shaders:

total local used in shared programs   : 54116 -> 30372 (-43.88%)

Probably modest advantage to execution, but it's an imporant
prerequisite to dropping some of the TGSI optimizations done by the
state tracker.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     | 58 ++++++++++++++-----
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 5454f042078..9c4a38f291b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -824,6 +824,8 @@ public:
    std::set<Location> locals;
 
    std::set<int> indirectTempArrays;
+   std::map<int, int> indirectTempOffsets;
+   std::map<int, std::pair<int, int> > tempArrayInfo;
    std::vector<int> tempArrayId;
 
    int clipVertexOutput;
@@ -931,9 +933,16 @@ bool Source::scanSource()
    }
    tgsi_parse_free(&parse);
 
-   // TODO: Compute based on relevant array sizes
-   if (indirectTempArrays.size())
-      info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16;
+   if (indirectTempArrays.size()) {
+      int tempBase = 0;
+      for (std::set<int>::const_iterator it = indirectTempArrays.begin();
+           it != indirectTempArrays.end(); ++it) {
+         std::pair<int, int>& info = tempArrayInfo[*it];
+         indirectTempOffsets.insert(std::make_pair(*it, tempBase - info.first));
+         tempBase += info.second;
+      }
+      info->bin.tlsSpace += tempBase * 16;
+   }
 
    if (info->io.genUserClip > 0) {
       info->io.clipDistances = info->io.genUserClip;
@@ -1191,6 +1200,9 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
    case TGSI_FILE_TEMPORARY:
       for (i = first; i <= last; ++i)
          tempArrayId[i] = arrayId;
+      if (arrayId)
+         tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair(
+                                                   first, last - first + 1)));
       break;
    case TGSI_FILE_NULL:
    case TGSI_FILE_ADDRESS:
@@ -1356,6 +1368,7 @@ private:
    void storeDst(const tgsi::Instruction::DstRegister dst, int c,
                  Value *val, Value *ptr);
 
+   void adjustTempIndex(int arrayId, int &idx, int &idx2d) const;
    Value *applySrcMod(Value *, int s, int c);
 
    Symbol *makeSym(uint file, int fileIndex, int idx, int c, uint32_t addr);
@@ -1661,11 +1674,23 @@ Converter::shiftAddress(Value *index)
    return mkOp2v(OP_SHL, TYPE_U32, getSSA(4, FILE_ADDRESS), index, mkImm(4));
 }
 
+void
+Converter::adjustTempIndex(int arrayId, int &idx, int &idx2d) const
+{
+   std::map<int, int>::const_iterator it =
+      code->indirectTempOffsets.find(arrayId);
+   if (it == code->indirectTempOffsets.end())
+      return;
+
+   idx2d = 1;
+   idx += it->second;
+}
+
 Value *
 Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
 {
    int idx2d = src.is2D() ? src.getIndex(1) : 0;
-   const int idx = src.getIndex(0);
+   int idx = src.getIndex(0);
    const int swz = src.getSwizzle(c);
    Instruction *ld;
 
@@ -1710,8 +1735,7 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
       int arrayid = src.getArrayId();
       if (!arrayid)
          arrayid = code->tempArrayId[idx];
-      idx2d = (code->indirectTempArrays.find(arrayid) !=
-               code->indirectTempArrays.end());
+      adjustTempIndex(arrayid, idx, idx2d);
    }
       /* fallthrough */
    default:
@@ -1725,7 +1749,7 @@ Converter::acquireDst(int d, int c)
 {
    const tgsi::Instruction::DstRegister dst = tgsi.getDst(d);
    const unsigned f = dst.getFile();
-   const int idx = dst.getIndex(0);
+   int idx = dst.getIndex(0);
    int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
 
    if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/)
@@ -1736,9 +1760,12 @@ Converter::acquireDst(int d, int c)
        (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT))
       return getScratch();
 
-   if (f == TGSI_FILE_TEMPORARY)
-      idx2d = code->indirectTempArrays.find(code->tempArrayId[idx]) !=
-         code->indirectTempArrays.end();
+   if (f == TGSI_FILE_TEMPORARY) {
+      int arrayid = dst.getArrayId();
+      if (!arrayid)
+         arrayid = code->tempArrayId[idx];
+      adjustTempIndex(arrayid, idx, idx2d);
+   }
 
    return getArrayForFile(f, idx2d)-> acquire(sub.cur->values, idx, c);
 }
@@ -1771,7 +1798,7 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
                     Value *val, Value *ptr)
 {
    const unsigned f = dst.getFile();
-   const int idx = dst.getIndex(0);
+   int idx = dst.getIndex(0);
    int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
 
    if (f == TGSI_FILE_SYSTEM_VALUE) {
@@ -1795,9 +1822,12 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
        f == TGSI_FILE_PREDICATE ||
        f == TGSI_FILE_ADDRESS ||
        f == TGSI_FILE_OUTPUT) {
-      if (f == TGSI_FILE_TEMPORARY)
-         idx2d = code->indirectTempArrays.find(code->tempArrayId[idx]) !=
-            code->indirectTempArrays.end();
+      if (f == TGSI_FILE_TEMPORARY) {
+         int arrayid = dst.getArrayId();
+         if (!arrayid)
+            arrayid = code->tempArrayId[idx];
+         adjustTempIndex(arrayid, idx, idx2d);
+      }
 
       getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val);
    } else {

From 22ac1f692228d8c44155bfa637c8e34356c7e0b7 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 23 Dec 2015 18:44:59 +0200
Subject: [PATCH 005/119] i965: Add state bit to trigger re-emission of color
 calculator state.

This will be used on Gen8+ to make sure that the color calculator
state pointers are re-emitted when switching back to the 3D pipeline
after some GPGPU workload due to a hardware workaround.  There are
other state bits already defined that could be used to achieve the
same effect but they all cause a ton of unrelated state to be
re-emitted (e.g. BRW_NEW_STATE_BASE_ADDRESS), so just define a new
one, state bits are cheap.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_context.h      | 2 ++
 src/mesa/drivers/dri/i965/brw_state_upload.c | 1 +
 src/mesa/drivers/dri/i965/gen6_cc.c          | 1 +
 3 files changed, 4 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 7b0340fc2ab..b80db00eb75 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -221,6 +221,7 @@ enum brw_state_id {
    BRW_STATE_COMPUTE_PROGRAM,
    BRW_STATE_CS_WORK_GROUPS,
    BRW_STATE_URB_SIZE,
+   BRW_STATE_CC_STATE,
    BRW_NUM_STATE_BITS
 };
 
@@ -309,6 +310,7 @@ enum brw_state_id {
 #define BRW_NEW_COMPUTE_PROGRAM         (1ull << BRW_STATE_COMPUTE_PROGRAM)
 #define BRW_NEW_CS_WORK_GROUPS          (1ull << BRW_STATE_CS_WORK_GROUPS)
 #define BRW_NEW_URB_SIZE                (1ull << BRW_STATE_URB_SIZE)
+#define BRW_NEW_CC_STATE                (1ull << BRW_STATE_CC_STATE)
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 2a671a58d8c..876e130f1cd 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -664,6 +664,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_COMPUTE_PROGRAM),
    DEFINE_BIT(BRW_NEW_CS_WORK_GROUPS),
    DEFINE_BIT(BRW_NEW_URB_SIZE),
+   DEFINE_BIT(BRW_NEW_CC_STATE),
    {0, 0, 0}
 };
 
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 3bab8f46ae8..cee139b7fd4 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -298,6 +298,7 @@ const struct brw_tracked_state gen6_color_calc_state = {
       .mesa = _NEW_COLOR |
               _NEW_STENCIL,
       .brw = BRW_NEW_BATCH |
+             BRW_NEW_CC_STATE |
              BRW_NEW_STATE_BASE_ADDRESS,
    },
    .emit = gen6_upload_color_calc_state,

From 044acb9256046bebec890cac7e42043754459fc2 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 2 Jan 2016 18:35:28 -0800
Subject: [PATCH 006/119] i965/gen8+: Invalidate color calc state when
 switching to the GPGPU pipeline.

This hardware bug can cause a hang on context restore while the
current pipeline is set to GPGPU (BDWGFX HSD 1909593).  In addition to
clearing the valid bit, mark the CC state as dirty to make sure that
the CC indirect state pointer is re-emitted when we switch back to the
3D pipeline.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_misc_state.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index cf6ba5b4aeb..2b2f031df63 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -868,6 +868,26 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
    const uint32_t _3DSTATE_PIPELINE_SELECT =
       is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45;
 
+   if (brw->gen >= 8 && brw->gen < 10) {
+      /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
+       *
+       *   Software must clear the COLOR_CALC_STATE Valid field in
+       *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
+       *   with Pipeline Select set to GPGPU.
+       *
+       * The internal hardware docs recommend the same workaround for Gen9
+       * hardware too.
+       */
+      if (pipeline == BRW_COMPUTE_PIPELINE) {
+         BEGIN_BATCH(2);
+         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
+         OUT_BATCH(0);
+         ADVANCE_BATCH();
+
+         brw->ctx.NewDriverState |= BRW_NEW_CC_STATE;
+      }
+   }
+
    /* Select the pipeline */
    BEGIN_BATCH(1);
    OUT_BATCH(_3DSTATE_PIPELINE_SELECT << 16 |

From 18c76551ee425b981efefc61f663a7781df17882 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 2 Jan 2016 19:02:09 -0800
Subject: [PATCH 007/119] i965/gen6-7: Implement stall and flushes required
 prior to switching pipelines.

Switching the current pipeline while it's not completely idle or the
read and write caches aren't flushed can lead to corruption.  Fixes
misrendering of at least the following Khronos CTS test:

 ES31-CTS.shader_image_load_store.basic-allTargets-store-fs

The stall and flushes are no longer required on Gen8+.

v2: Emit PIPE_CONTROL with non-zero post-sync op before the write
    cache flush on SNB due to hardware bug. (Ken)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93323
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_misc_state.c | 37 ++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 2b2f031df63..95edbc9edcd 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -886,6 +886,43 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
 
          brw->ctx.NewDriverState |= BRW_NEW_CC_STATE;
       }
+
+   } else if (brw->gen >= 6) {
+      /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+       * PIPELINE_SELECT [DevBWR+]":
+       *
+       *   Project: DEVSNB+
+       *
+       *   Software must ensure all the write caches are flushed through a
+       *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
+       *   command to invalidate read only caches prior to programming
+       *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
+       */
+      const unsigned dc_flush =
+         brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_INVALIDATE : 0;
+
+      if (brw->gen == 6) {
+         /* Hardware workaround: SNB B-Spec says:
+          *
+          *   Before a PIPE_CONTROL with Write Cache Flush Enable = 1, a
+          *   PIPE_CONTROL with any non-zero post-sync-op is required.
+          */
+         brw_emit_post_sync_nonzero_flush(brw);
+      }
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                  dc_flush |
+                                  PIPE_CONTROL_NO_WRITE |
+                                  PIPE_CONTROL_CS_STALL);
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                                  PIPE_CONTROL_NO_WRITE);
    }
 
    /* Select the pipeline */

From 635be1402c485b154ab1bf23e5448827364e70a5 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 2 Jan 2016 19:05:48 -0800
Subject: [PATCH 008/119] i965/gen4-5: Emit MI_FLUSH as required prior to
 switching pipelines.

AFAIK brw_emit_select_pipeline() is only called once during context
init on Gen4-5, at which point the pipeline is likely to be already
idle so it may just happen to work by luck regardless of the MI_FLUSH.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_misc_state.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 95edbc9edcd..8335865b2d3 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -923,6 +923,19 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
                                   PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                   PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                   PIPE_CONTROL_NO_WRITE);
+
+   } else {
+      /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+       * PIPELINE_SELECT [DevBWR+]":
+       *
+       *   Project: PRE-DEVSNB
+       *
+       *   Software must ensure the current pipeline is flushed via an
+       *   MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
+       */
+      BEGIN_BATCH(1);
+      OUT_BATCH(MI_FLUSH);
+      ADVANCE_BATCH();
    }
 
    /* Select the pipeline */

From c8df0e7bf35cbab649c8d0e0205746293e686ce3 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 2 Jan 2016 19:06:48 -0800
Subject: [PATCH 009/119] i965/gen7: Emit stall and dummy primitive draw after
 switching to the 3D pipeline.

This hardware bug can supposedly lead to a hang on IVB and VLV.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_misc_state.c | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 8335865b2d3..3686cdf8ff4 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -944,6 +944,30 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
              (brw->gen >= 9 ? (3 << 8) : 0) |
              (pipeline == BRW_COMPUTE_PIPELINE ? 2 : 0));
    ADVANCE_BATCH();
+
+   if (brw->gen == 7 && !brw->is_haswell &&
+       pipeline == BRW_RENDER_PIPELINE) {
+      /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+       * PIPELINE_SELECT [DevBWR+]":
+       *
+       *   Project: DEVIVB, DEVHSW:GT3:A0
+       *
+       *   Software must send a pipe_control with a CS stall and a post sync
+       *   operation and then a dummy DRAW after every MI_SET_CONTEXT and
+       *   after any PIPELINE_SELECT that is enabling 3D mode.
+       */
+      gen7_emit_cs_stall_flush(brw);
+
+      BEGIN_BATCH(7);
+      OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2));
+      OUT_BATCH(_3DPRIM_POINTLIST);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
 }
 
 /**

From 0556b87de4302195402ade43f400e859d9bfad0e Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sun, 3 Jan 2016 10:06:52 -0800
Subject: [PATCH 010/119] i965/gen7.5+: Disable resource streamer during GPGPU
 workloads.

The RS and hardware binding tables are only supported on the 3D
pipeline and can lead to corruption if left enabled during a GPGPU
workload.  Disable it when switching to the GPGPU (or media) pipeline
and re-enable it when switching back to the 3D pipeline.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 .../drivers/dri/i965/brw_binding_tables.c     |  2 +-
 src/mesa/drivers/dri/i965/brw_misc_state.c    | 40 +++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_state.h         |  1 +
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c
index 7fa5d602b96..f3a0310861c 100644
--- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
+++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
@@ -365,7 +365,7 @@ gen7_disable_hw_binding_tables(struct brw_context *brw)
 /**
  * Enable hardware binding tables and set up the binding table pool.
  */
-static void
+void
 gen7_enable_hw_binding_tables(struct brw_context *brw)
 {
    if (!brw->use_resource_streamer)
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 3686cdf8ff4..319c2a5669f 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -868,6 +868,26 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
    const uint32_t _3DSTATE_PIPELINE_SELECT =
       is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45;
 
+   if (brw->use_resource_streamer && pipeline != BRW_RENDER_PIPELINE) {
+      /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+       * PIPELINE_SELECT [DevBWR+]":
+       *
+       *   Project: HSW, BDW, CHV, SKL, BXT
+       *
+       *   Hardware Binding Tables are only supported for 3D
+       *   workloads. Resource streamer must be enabled only for 3D
+       *   workloads. Resource streamer must be disabled for Media and GPGPU
+       *   workloads.
+       */
+      BEGIN_BATCH(1);
+      OUT_BATCH(MI_RS_CONTROL | 0);
+      ADVANCE_BATCH();
+
+      gen7_disable_hw_binding_tables(brw);
+
+      /* XXX - Disable gather constant pool too when we start using it. */
+   }
+
    if (brw->gen >= 8 && brw->gen < 10) {
       /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
        *
@@ -968,6 +988,26 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
       OUT_BATCH(0);
       ADVANCE_BATCH();
    }
+
+   if (brw->use_resource_streamer && pipeline == BRW_RENDER_PIPELINE) {
+      /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+       * PIPELINE_SELECT [DevBWR+]":
+       *
+       *   Project: HSW, BDW, CHV, SKL, BXT
+       *
+       *   Hardware Binding Tables are only supported for 3D
+       *   workloads. Resource streamer must be enabled only for 3D
+       *   workloads. Resource streamer must be disabled for Media and GPGPU
+       *   workloads.
+       */
+      BEGIN_BATCH(1);
+      OUT_BATCH(MI_RS_CONTROL | 1);
+      ADVANCE_BATCH();
+
+      gen7_enable_hw_binding_tables(brw);
+
+      /* XXX - Re-enable gather constant pool here. */
+   }
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index d29b997b963..7d61b7c4ab6 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -396,6 +396,7 @@ void gen7_update_binding_table_from_array(struct brw_context *brw,
                                           gl_shader_stage stage,
                                           const uint32_t* binding_table,
                                           int num_surfaces);
+void gen7_enable_hw_binding_tables(struct brw_context *brw);
 void gen7_disable_hw_binding_tables(struct brw_context *brw);
 void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw);
 

From cf96bce0ca8b2d6b5d4641efea1f2feedc024957 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Thu, 14 Jan 2016 14:10:59 +0200
Subject: [PATCH 011/119] glsl: mark explicit uniforms as explicit in other
 stages too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If shader declares uniform explicit location in one stage but
implicit in another, explicit location should be used. Patch marks
implicit uniforms as explicit if they were explicit in previous stage.
This makes sure that we don't treat them implicit later when assigning
locations.

Fixes following CTS test:
   ES31-CTS.explicit_uniform_location.uniform-loc-implicit-in-some-stages3

v2: move check to cross_validate_globals (Timothy)

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
---
 src/glsl/linker.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 564c4712871..3f40c0be332 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -992,7 +992,17 @@ cross_validate_globals(struct gl_shader_program *prog,
 
 	       existing->data.location = var->data.location;
 	       existing->data.explicit_location = true;
-	    }
+	    } else {
+               /* Check if uniform with implicit location was marked explicit
+                * by earlier shader stage. If so, mark it explicit in this stage
+                * too to make sure later processing does not treat it as
+                * implicit one.
+                */
+               if (existing->data.explicit_location) {
+	          var->data.location = existing->data.location;
+	          var->data.explicit_location = true;
+               }
+            }
 
             /* From the GLSL 4.20 specification:
              * "A link error will result if two compilation units in a program

From bb6612f06b8320911d9f86f7463f823e76969f15 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 13 Jan 2016 13:32:44 -0800
Subject: [PATCH 012/119] nir/builder: Add a nir_build_ivec4() convenience
 helper.

nir_build_ivec4 is more readable and succinct than using nir_build_imm
directly, even if you have C99.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/nir/nir_builder.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/glsl/nir/nir_builder.h b/src/glsl/nir/nir_builder.h
index cfaaf8e03e3..88ba3a1c269 100644
--- a/src/glsl/nir/nir_builder.h
+++ b/src/glsl/nir/nir_builder.h
@@ -120,6 +120,20 @@ nir_imm_int(nir_builder *build, int x)
    return nir_build_imm(build, 1, v);
 }
 
+static inline nir_ssa_def *
+nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w)
+{
+   nir_const_value v;
+
+   memset(&v, 0, sizeof(v));
+   v.i[0] = x;
+   v.i[1] = y;
+   v.i[2] = z;
+   v.i[3] = w;
+
+   return nir_build_imm(build, 4, v);
+}
+
 static inline nir_ssa_def *
 nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
               nir_ssa_def *src1, nir_ssa_def *src2, nir_ssa_def *src3)

From 824d82025d0bff9841647942aca501fba16fc1a9 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 13 Jan 2016 15:23:48 -0800
Subject: [PATCH 013/119] i965: Make an is_scalar boolean in brw_compile_vs().

Shorter than compiler->scalar_stage[MESA_SHADER_VERTEX], which can
help with line-wrapping.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index c6a52c5d183..ca270660458 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1988,11 +1988,11 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str)
 {
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-                                      compiler->scalar_stage[MESA_SHADER_VERTEX]);
-   shader = brw_postprocess_nir(shader, compiler->devinfo,
-                                compiler->scalar_stage[MESA_SHADER_VERTEX]);
+                                      is_scalar);
+   shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar);
 
    const unsigned *assembly = NULL;
 
@@ -2018,7 +2018,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
     * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
     * vec4 mode, the hardware appears to wedge unless we read something.
     */
-   if (compiler->scalar_stage[MESA_SHADER_VERTEX])
+   if (is_scalar)
       prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
    else
       prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);
@@ -2037,7 +2037,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
    else
       prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
 
-   if (compiler->scalar_stage[MESA_SHADER_VERTEX]) {
+   if (is_scalar) {
       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
       fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,

From a3500f943e2c61c0aed043108132f35b79d16676 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 13 Jan 2016 15:04:39 -0800
Subject: [PATCH 014/119] i965: Make add_const_offset_to_base() work at the
 shader level.

This makes it a pass, hiding the parameter structs and block callbacks
so it's simpler to work with.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c | 38 ++++++++++++++++-------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index f8b258bf96c..55ba732dec2 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -60,7 +60,7 @@ struct add_const_offset_to_base_params {
 };
 
 static bool
-add_const_offset_to_base(nir_block *block, void *closure)
+add_const_offset_to_base_block(nir_block *block, void *closure)
 {
    struct add_const_offset_to_base_params *params = closure;
    nir_builder *b = &params->b;
@@ -85,7 +85,19 @@ add_const_offset_to_base(nir_block *block, void *closure)
       }
    }
    return true;
+}
 
+static void
+add_const_offset_to_base(nir_shader *nir, nir_variable_mode mode)
+{
+   struct add_const_offset_to_base_params params = { .mode = mode };
+
+   nir_foreach_function(nir, f) {
+      if (f->impl) {
+         nir_builder_init(&params.b, f->impl);
+         nir_foreach_block(f->impl, add_const_offset_to_base_block, &params);
+      }
+   }
 }
 
 static bool
@@ -195,10 +207,6 @@ brw_nir_lower_inputs(nir_shader *nir,
                      const struct brw_device_info *devinfo,
                      bool is_scalar)
 {
-   struct add_const_offset_to_base_params params = {
-      .mode = nir_var_shader_in
-   };
-
    switch (nir->stage) {
    case MESA_SHADER_VERTEX:
       /* Start with the location of the variable's base. */
@@ -224,10 +232,10 @@ brw_nir_lower_inputs(nir_shader *nir,
          /* This pass needs actual constants */
          nir_opt_constant_folding(nir);
 
+         add_const_offset_to_base(nir, nir_var_shader_in);
+
          nir_foreach_function(nir, function) {
             if (function->impl) {
-               nir_builder_init(&params.b, function->impl);
-               nir_foreach_block(function->impl, add_const_offset_to_base, &params);
                nir_foreach_block(function->impl, remap_vs_attrs, &inputs_read);
             }
          }
@@ -270,10 +278,10 @@ brw_nir_lower_inputs(nir_shader *nir,
          /* This pass needs actual constants */
          nir_opt_constant_folding(nir);
 
+         add_const_offset_to_base(nir, nir_var_shader_in);
+
          nir_foreach_function(nir, function) {
             if (function->impl) {
-               nir_builder_init(&params.b, function->impl);
-               nir_foreach_block(function->impl, add_const_offset_to_base, &params);
                nir_foreach_block(function->impl, remap_inputs_with_vue_map,
                                  &input_vue_map);
             }
@@ -296,10 +304,10 @@ brw_nir_lower_inputs(nir_shader *nir,
       /* This pass needs actual constants */
       nir_opt_constant_folding(nir);
 
+      add_const_offset_to_base(nir, nir_var_shader_in);
+
       nir_foreach_function(nir, function) {
          if (function->impl) {
-            nir_builder_init(&params.b, function->impl);
-            nir_foreach_block(function->impl, add_const_offset_to_base, &params);
             nir_builder_init(&state.b, function->impl);
             nir_foreach_block(function->impl, remap_patch_urb_offsets, &state);
          }
@@ -339,10 +347,6 @@ brw_nir_lower_outputs(nir_shader *nir,
       }
       break;
    case MESA_SHADER_TESS_CTRL: {
-      struct add_const_offset_to_base_params params = {
-         .mode = nir_var_shader_out
-      };
-
       struct remap_patch_urb_offsets_state state;
       brw_compute_tess_vue_map(&state.vue_map, nir->info.outputs_written,
                                nir->info.patch_outputs_written);
@@ -356,10 +360,10 @@ brw_nir_lower_outputs(nir_shader *nir,
       /* This pass needs actual constants */
       nir_opt_constant_folding(nir);
 
+      add_const_offset_to_base(nir, nir_var_shader_out);
+
       nir_foreach_function(nir, function) {
          if (function->impl) {
-            nir_builder_init(&params.b, function->impl);
-            nir_foreach_block(function->impl, add_const_offset_to_base, &params);
             nir_builder_init(&state.b, function->impl);
             nir_foreach_block(function->impl, remap_patch_urb_offsets, &state);
          }

From 3657cbf24f3b0baf7e3382e572d97a36b0ed4103 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 13 Jan 2016 15:07:18 -0800
Subject: [PATCH 015/119] i965: Apply add_const_offset_to_base for vec4 VS
 inputs too.

This shouldn't hurt anything, and I'm about to introduce a pass that
will want it.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 55ba732dec2..935529a6003 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -220,6 +220,11 @@ brw_nir_lower_inputs(nir_shader *nir,
        */
       nir_lower_io(nir, nir_var_shader_in, type_size_vec4);
 
+      /* This pass needs actual constants */
+      nir_opt_constant_folding(nir);
+
+      add_const_offset_to_base(nir, nir_var_shader_in);
+
       if (is_scalar) {
          /* Finally, translate VERT_ATTRIB_* values into the actual registers.
           *
@@ -229,11 +234,6 @@ brw_nir_lower_inputs(nir_shader *nir,
           */
          GLbitfield64 inputs_read = nir->info.inputs_read;
 
-         /* This pass needs actual constants */
-         nir_opt_constant_folding(nir);
-
-         add_const_offset_to_base(nir, nir_var_shader_in);
-
          nir_foreach_function(nir, function) {
             if (function->impl) {
                nir_foreach_block(function->impl, remap_vs_attrs, &inputs_read);

From 781d2787bc1cf975757a95d0d9324f734fa61c09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Samuel=20Iglesias=20Gons=C3=A1lvez?= <siglesias@igalia.com>
Date: Tue, 12 Jan 2016 15:36:56 +0100
Subject: [PATCH 016/119] glsl: restrict consumer stage condition to modify
 interpolation type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Only modify interpolation type for integer-based varyings or when the
consumer is known and different than fragment shader.

If we are linking separate shader programs and the consumer is unknown,
the consumer could be added later and be a fragment shader. If we
modify the interpolation type in this case, we could read wrong
values in the fragment shader inputs, as shown in bug 93320.

Fixes the following CTS test:
   ES31-CTS.vertex_attrib_binding.advanced-bindingUpdate

Fixes the following dEQP tests:

dEQP-GLES31.functional.separate_shader.random.102
dEQP-GLES31.functional.separate_shader.random.111
dEQP-GLES31.functional.separate_shader.random.115
dEQP-GLES31.functional.separate_shader.random.17
dEQP-GLES31.functional.separate_shader.random.22
dEQP-GLES31.functional.separate_shader.random.23
dEQP-GLES31.functional.separate_shader.random.3
dEQP-GLES31.functional.separate_shader.random.32
dEQP-GLES31.functional.separate_shader.random.39
dEQP-GLES31.functional.separate_shader.random.64
dEQP-GLES31.functional.separate_shader.random.73
dEQP-GLES31.functional.separate_shader.random.91

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93320
Signed-off-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/glsl/link_varyings.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 7cc58800765..09f80d0f39d 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -968,10 +968,12 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
    }
 
    if ((consumer_var == NULL && producer_var->type->contains_integer()) ||
-       consumer_stage != MESA_SHADER_FRAGMENT) {
+       (consumer_stage != -1 && consumer_stage != MESA_SHADER_FRAGMENT)) {
       /* Since this varying is not being consumed by the fragment shader, its
-       * interpolation type varying cannot possibly affect rendering.  Also,
-       * this variable is non-flat and is (or contains) an integer.
+       * interpolation type varying cannot possibly affect rendering.
+       * Also, this variable is non-flat and is (or contains) an integer.
+       * If the consumer stage is unknown, don't modify the interpolation
+       * type as it could affect rendering later with separate shaders.
        *
        * lower_packed_varyings requires all integer varyings to flat,
        * regardless of where they appear.  We can trivially satisfy that

From 2f9a325b6ac1609a9986b4bee161610730da7da5 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Fri, 15 Jan 2016 20:12:24 +0100
Subject: [PATCH 017/119] llvmpipe: fix "leaking" textures

This was not really a leak per se, but we were referencing the textures for
longer than intended. If textures were set via llvmpipe_set_sampler_views()
(for fs) and then picked up by lp_setup_set_fragment_sampler_views(), they
were referenced in the setup state. However, the only way to unreference them
was by replacing them with another texture, and not when the texture slot
was replaced with a NULL sampler view. (They were then further also referenced
by the scene too which might have additional minor side effects as we limit
the memory size which is allowed to be referenced by a scene in a rather crude
way.) Only setup destruction (at context destruction time) then finally would
get rid of the references.
Fix this by noting the number of textures the last time, and unreference
things if the new view is NULL (avoiding having to unreference things
always up to PIPE_MAX_SHADER_SAMPLER_VIEWS which would also have worked).
Found by code inspection, no test...

v2: rename var

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_setup.c         | 10 ++++++++--
 src/gallium/drivers/llvmpipe/lp_setup_context.h |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index bd850519468..e8c3e7c7678 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -796,13 +796,15 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
                                     unsigned num,
                                     struct pipe_sampler_view **views)
 {
-   unsigned i;
+   unsigned i, max_tex_num;
 
    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
    assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS);
 
-   for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
+   max_tex_num = MAX2(num, setup->fs.current_tex_num);
+
+   for (i = 0; i < max_tex_num; i++) {
       struct pipe_sampler_view *view = i < num ? views[i] : NULL;
 
       if (view) {
@@ -922,7 +924,11 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
             assert(jit_tex->base);
          }
       }
+      else {
+         pipe_resource_reference(&setup->fs.current_tex[i], NULL);
+      }
    }
+   setup->fs.current_tex_num = num;
 
    setup->dirty |= LP_SETUP_NEW_FS;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 80acd74bddd..03bb8ce2b6f 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -133,6 +133,7 @@ struct lp_setup_context
       const struct lp_rast_state *stored; /**< what's in the scene */
       struct lp_rast_state current;  /**< currently set state */
       struct pipe_resource *current_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS];
+      unsigned current_tex_num;
    } fs;
 
    /** fragment shader constants */

From 03f66dfb4b399f078fc1bc99be1f0937ce981def Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Fri, 15 Jan 2016 03:10:40 +0100
Subject: [PATCH 018/119] llvmpipe: ditch additional ref counting for
 vertex/geometry sampler views

The cleaning up was quite a performance hog (making pipe_resource_reference
the number two in profilers on the vertex path, and 3rd overall, with its
cousin pipe_reference_described not far behind) if there were lots
of tiny draw calls (ipers). Now the reason was really that it was blindly
calling this for all potential shader views (so 32 each for vs and gs) even
though the app never touched a single one which could have been fixed,
however I can't come up with a good reason why we refcount these. We've got
references, of course, in the sampler views, which should be quite sufficient
as we do all vertex and geometry shader execution fully synchronous.
(Calling prepare_shader_sampling for all draw calls even if there were no
changes looks quite suboptimal too, but generally we don't really expect vs/gs
shader sampling to be used much with llvmpipe, and there's even an early exit
if there aren't any views to avoid the "null loop" albeit it's now no longer
always trying to loop through all 32 slots. Maybe improve another time...).
Of course, if we manage to make vertex loads run asynchronously some day,
we need references again, but adding that back would be the least of the
problems...
Also only set LP_NEW_SAMPLER_VIEW for fragment sampler views. Nothing on the
vertex side depends on it (I suppose we'd really wanted a separate flag in
any case).
(Good for a 3% improvement or so in ipers under the right conditions.)

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_context.h     |  2 -
 src/gallium/drivers/llvmpipe/lp_draw_arrays.c |  3 --
 src/gallium/drivers/llvmpipe/lp_state.h       |  6 ---
 .../drivers/llvmpipe/lp_state_sampler.c       | 47 +++++--------------
 4 files changed, 12 insertions(+), 46 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 62d99bbaac8..d4bd02d0225 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -82,8 +82,6 @@ struct llvmpipe_context {
    struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
    struct pipe_index_buffer index_buffer;
-   struct pipe_resource *mapped_vs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   struct pipe_resource *mapped_gs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS];
 
    unsigned num_samplers[PIPE_SHADER_TYPES];
    unsigned num_sampler_views[PIPE_SHADER_TYPES];
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index edfb2040969..22ef5fc17f9 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -149,9 +149,6 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
          draw_vs_reset_so(lp->vs);
       }
    }
-   
-   llvmpipe_cleanup_vertex_sampling(lp);
-   llvmpipe_cleanup_geometry_sampling(lp);
 
    /*
     * TODO: Flush only when a user vertex/index buffer is present
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 2da6caaef16..78918cf984d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -130,16 +130,10 @@ void
 llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *ctx,
                                  unsigned num,
                                  struct pipe_sampler_view **views);
-void
-llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx);
-
 
 void
 llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *ctx,
                                    unsigned num,
                                    struct pipe_sampler_view **views);
-void
-llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx);
-
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 1e055878f7c..69af38e92c0 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -98,8 +98,9 @@ llvmpipe_bind_sampler_states(struct pipe_context *pipe,
                         llvmpipe->samplers[shader],
                         llvmpipe->num_samplers[shader]);
    }
-
-   llvmpipe->dirty |= LP_NEW_SAMPLER;
+   else {
+      llvmpipe->dirty |= LP_NEW_SAMPLER;
+   }
 }
 
 
@@ -146,8 +147,9 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe,
                              llvmpipe->sampler_views[shader],
                              llvmpipe->num_sampler_views[shader]);
    }
-
-   llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
+   else {
+      llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
+   }
 }
 
 
@@ -228,8 +230,7 @@ prepare_shader_sampling(
    struct llvmpipe_context *lp,
    unsigned num,
    struct pipe_sampler_view **views,
-   unsigned shader_type,
-   struct pipe_resource *mapped_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS])
+   unsigned shader_type)
 {
 
    unsigned i;
@@ -242,7 +243,7 @@ prepare_shader_sampling(
    if (!num)
       return;
 
-   for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
+   for (i = 0; i < num; i++) {
       struct pipe_sampler_view *view = i < num ? views[i] : NULL;
 
       if (view) {
@@ -253,11 +254,6 @@ prepare_shader_sampling(
          unsigned first_level = 0;
          unsigned last_level = 0;
 
-         /* We're referencing the texture's internal data, so save a
-          * reference to it.
-          */
-         pipe_resource_reference(&mapped_tex[i], tex);
-
          if (!lp_tex->dt) {
             /* regular texture - setup array of mipmap level offsets */
             struct pipe_resource *res = view->texture;
@@ -335,47 +331,28 @@ prepare_shader_sampling(
 
 
 /**
- * Called during state validation when LP_NEW_SAMPLER_VIEW is set.
+ * Called whenever we're about to draw (no dirty flag, FIXME?).
  */
 void
 llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp,
                                  unsigned num,
                                  struct pipe_sampler_view **views)
 {
-   prepare_shader_sampling(lp, num, views, PIPE_SHADER_VERTEX,
-                           lp->mapped_vs_tex);
-}
-
-void
-llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx)
-{
-   unsigned i;
-   for (i = 0; i < Elements(ctx->mapped_vs_tex); i++) {
-      pipe_resource_reference(&ctx->mapped_vs_tex[i], NULL);
-   }
+   prepare_shader_sampling(lp, num, views, PIPE_SHADER_VERTEX);
 }
 
 
 /**
- * Called during state validation when LP_NEW_SAMPLER_VIEW is set.
+ * Called whenever we're about to draw (no dirty flag, FIXME?).
  */
 void
 llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *lp,
                                    unsigned num,
                                    struct pipe_sampler_view **views)
 {
-   prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY,
-                           lp->mapped_gs_tex);
+   prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY);
 }
 
-void
-llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx)
-{
-   unsigned i;
-   for (i = 0; i < Elements(ctx->mapped_gs_tex); i++) {
-      pipe_resource_reference(&ctx->mapped_gs_tex[i], NULL);
-   }
-}
 
 void
 llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe)

From 0a6811207fbe18d49c7ab95f93ed01f75ffcdda0 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason@jlekstrand.net>
Date: Thu, 14 Jan 2016 12:08:57 -0800
Subject: [PATCH 019/119] i965/vec4: Use UW type for multiply into accumulator
 on GEN8+

BDW adds the following restriction: "When multiplying DW x DW, the dst
cannot be accumulator."

Cc: "11.1,11.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 0ae723f07e9..4ee2ed47d40 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1069,7 +1069,11 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_umul_high: {
       struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
 
-      emit(MUL(acc, op[0], op[1]));
+      if (devinfo->gen >=8)
+         emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW)));
+      else
+         emit(MUL(acc, op[0], op[1]));
+
       emit(MACH(dst, op[0], op[1]));
       break;
    }

From 9870f798beab701a9edda81ff7ccc39f1875d610 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 14 Jan 2016 20:27:51 -0800
Subject: [PATCH 020/119] i965/fs/generator: Take an actual shader stage rather
 than a string

Cc: "11.1" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp   | 2 +-
 src/mesa/drivers/dri/i965/brw_fs.cpp              | 6 ++++--
 src/mesa/drivers/dri/i965/brw_fs.h                | 4 ++--
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp    | 7 ++++---
 src/mesa/drivers/dri/i965/brw_shader.cpp          | 2 +-
 src/mesa/drivers/dri/i965/brw_vec4.cpp            | 2 +-
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 2 +-
 7 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
index e684bdbb72c..c6ae3d848bf 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
@@ -32,7 +32,7 @@ brw_blorp_eu_emitter::brw_blorp_eu_emitter(struct brw_context *brw,
      generator(brw->intelScreen->compiler, brw,
                mem_ctx, (void *) rzalloc(mem_ctx, struct brw_wm_prog_key),
                (struct brw_stage_prog_data *) rzalloc(mem_ctx, struct brw_wm_prog_data),
-               0, false, "BLORP")
+               0, false, MESA_SHADER_FRAGMENT)
 {
    if (debug_flag)
       generator.enable_debug("blorp");
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index d4205ba66ad..05f335e5dfc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5603,7 +5603,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
    }
 
    fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
-                  v.promoted_constants, v.runtime_check_aads_emit, "FS");
+                  v.promoted_constants, v.runtime_check_aads_emit,
+                  MESA_SHADER_FRAGMENT);
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
@@ -5728,7 +5729,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
    }
 
    fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
-                  v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
+                  v8.promoted_constants, v8.runtime_check_aads_emit,
+                  MESA_SHADER_COMPUTE);
    if (INTEL_DEBUG & DEBUG_CS) {
       char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
                                    shader->info.label ? shader->info.label :
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 372f7606cef..9a54c2dd0be 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -425,7 +425,7 @@ public:
                 struct brw_stage_prog_data *prog_data,
                 unsigned promoted_constants,
                 bool runtime_check_aads_emit,
-                const char *stage_abbrev);
+                gl_shader_stage stage);
    ~fs_generator();
 
    void enable_debug(const char *shader_name);
@@ -536,7 +536,7 @@ private:
    bool runtime_check_aads_emit;
    bool debug_flag;
    const char *shader_name;
-   const char *stage_abbrev;
+   gl_shader_stage stage;
    void *mem_ctx;
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index f582a817bd1..741a77ba566 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -111,14 +111,14 @@ fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                            struct brw_stage_prog_data *prog_data,
                            unsigned promoted_constants,
                            bool runtime_check_aads_emit,
-                           const char *stage_abbrev)
+                           gl_shader_stage stage)
 
    : compiler(compiler), log_data(log_data),
      devinfo(compiler->devinfo), key(key),
      prog_data(prog_data),
      promoted_constants(promoted_constants),
      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
-     stage_abbrev(stage_abbrev), mem_ctx(mem_ctx)
+     stage(stage), mem_ctx(mem_ctx)
 {
    p = rzalloc(mem_ctx, struct brw_codegen);
    brw_init_codegen(devinfo, p, mem_ctx);
@@ -2306,7 +2306,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
                               "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
                               "%d:%d spills:fills, Promoted %u constants, "
                               "compacted %d to %d bytes.",
-                              stage_abbrev, dispatch_width, before_size / 16,
+                              _mesa_shader_stage_to_abbrev(stage),
+                              dispatch_width, before_size / 16,
                               loop_count, cfg->cycle_count, spill_count,
                               fill_count, promoted_constants, before_size,
                               after_size);
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 3a69c23446c..d92bad25a72 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -1411,7 +1411,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
 
       fs_generator g(compiler, log_data, mem_ctx, (void *) key,
                      &prog_data->base.base, v.promoted_constants, false,
-                     "TES");
+                     MESA_SHADER_TESS_EVAL);
       if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
          g.enable_debug(ralloc_asprintf(mem_ctx,
                                         "%s tessellation evaluation shader %s",
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index ca270660458..e7aec1f1a9d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -2052,7 +2052,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
 
       fs_generator g(compiler, log_data, mem_ctx, (void *) key,
                      &prog_data->base.base, v.promoted_constants,
-                     v.runtime_check_aads_emit, "VS");
+                     v.runtime_check_aads_emit, MESA_SHADER_VERTEX);
       if (INTEL_DEBUG & DEBUG_VS) {
          const char *debug_name =
             ralloc_asprintf(mem_ctx, "%s vertex shader %s",
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index b13d36e2c7d..b2a971a40ff 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -839,7 +839,7 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
 
          fs_generator g(compiler, log_data, mem_ctx, &c.key,
                         &prog_data->base.base, v.promoted_constants,
-                        false, "GS");
+                        false, MESA_SHADER_GEOMETRY);
          if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
             const char *label =
                shader->info.label ? shader->info.label : "unnamed";

From 61b0cfd84ee6fb1273928ee2c8751301ae805eaa Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 14 Jan 2016 20:42:47 -0800
Subject: [PATCH 021/119] i965/fs: Always set channel 2 of texture headers in
 some stages

In the vertex and fragment stages, the hardware is nice to us and leaves
g0.2 zerod out for us so we can use it for headers.  However, in compute,
geometry, and tessellation stages, the hardware is not so nice.  In
particular, for compute shaders on BDW, the hardware places some debug bits
in 23:15.  As it happens, bit 15 is interpreted by the sampler as the alpha
channel mask.  This means that if you use a texturing instruction with a
header in a compute shader, you may randomly get the alpha channel
disabled.  Since channel masks affect the return length of the sampler
message, this can lead the GPU to expect a different mlen to the one you
specified in the shader and this, in turn, hangs your GPU.

Cc: "11.1" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 741a77ba566..e05622ae7a8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -913,6 +913,14 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
             /* Set the offset bits in DWord 2. */
             brw_MOV(p, get_element_ud(header_reg, 2),
                        brw_imm_ud(inst->offset));
+         } else if (stage != MESA_SHADER_VERTEX &&
+                    stage != MESA_SHADER_FRAGMENT) {
+            /* The vertex and fragment stages have g0.2 set to 0, so
+             * header0.2 is 0 when g0 is copied. Other stages may not, so we
+             * must set it to 0 to avoid setting undesirable bits in the
+             * message.
+             */
+            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
          }
 
          brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);

From d54a70aa18a0139a5eefbc5193fabaf739317533 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 14 Jan 2016 23:27:03 -0800
Subject: [PATCH 022/119] glsl: Allow implicit int -> uint conversions for
 bitwise operators (&, ^, |).

The ARB has decided that implicit conversions should be performed for
bitwise operators in future language revisions.  Implementations of
current language revisions may or may not perform them.

This patch makes Mesa apply implicti conversions even on current
language versions.  Applications appear to expect this behavior,
and there's really no downside to doing so.

Fixes shader compilation in Shadow of Mordor.

Bugzilla: https://www.khronos.org/bugzilla/show_bug.cgi?id=1405
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Cc: mesa-stable@lists.freedesktop.org
---
 src/glsl/ast_to_hir.cpp | 46 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 13696a36b0c..0f51c54bfe3 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -487,15 +487,17 @@ unary_arithmetic_result_type(const struct glsl_type *type,
  * If the given types to the bit-logic operator are invalid, return
  * glsl_type::error_type.
  *
- * \param type_a Type of LHS of bit-logic op
- * \param type_b Type of RHS of bit-logic op
+ * \param value_a LHS of bit-logic op
+ * \param value_b RHS of bit-logic op
  */
 static const struct glsl_type *
-bit_logic_result_type(const struct glsl_type *type_a,
-                      const struct glsl_type *type_b,
+bit_logic_result_type(ir_rvalue * &value_a, ir_rvalue * &value_b,
                       ast_operators op,
                       struct _mesa_glsl_parse_state *state, YYLTYPE *loc)
 {
+   const glsl_type *type_a = value_a->type;
+   const glsl_type *type_b = value_b->type;
+
    if (!state->check_bitwise_operations_allowed(loc)) {
       return glsl_type::error_type;
    }
@@ -517,6 +519,36 @@ bit_logic_result_type(const struct glsl_type *type_a,
       return glsl_type::error_type;
    }
 
+   /* Prior to GLSL 4.0 / GL_ARB_gpu_shader5, implicit conversions didn't
+    * make sense for bitwise operations, as they don't operate on floats.
+    *
+    * GLSL 4.0 added implicit int -> uint conversions, which are relevant
+    * here.  It wasn't clear whether or not we should apply them to bitwise
+    * operations.  However, Khronos has decided that they should in future
+    * language revisions.  Applications also rely on this behavior.  We opt
+    * to apply them in general, but issue a portability warning.
+    *
+    * See https://www.khronos.org/bugzilla/show_bug.cgi?id=1405
+    */
+   if (type_a->base_type != type_b->base_type) {
+      if (!apply_implicit_conversion(type_a, value_b, state)
+          && !apply_implicit_conversion(type_b, value_a, state)) {
+         _mesa_glsl_error(loc, state,
+                          "could not implicitly convert operands to "
+                          "`%s` operator",
+                          ast_expression::operator_string(op));
+         return glsl_type::error_type;
+      } else {
+         _mesa_glsl_warning(loc, state,
+                            "some implementations may not support implicit "
+                            "int -> uint conversions for `%s' operators; "
+                            "consider casting explicitly for portability",
+                            ast_expression::operator_string(op));
+      }
+      type_a = value_a->type;
+      type_b = value_b->type;
+   }
+
    /*     "The fundamental types of the operands (signed or unsigned) must
     *     match,"
     */
@@ -1434,8 +1466,7 @@ ast_expression::do_hir(exec_list *instructions,
    case ast_bit_or:
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
-      type = bit_logic_result_type(op[0]->type, op[1]->type, this->oper,
-                                   state, &loc);
+      type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc);
       result = new(ctx) ir_expression(operations[this->oper], type,
                                       op[0], op[1]);
       error_emitted = op[0]->type->is_error() || op[1]->type->is_error();
@@ -1625,8 +1656,7 @@ ast_expression::do_hir(exec_list *instructions,
    case ast_or_assign: {
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
-      type = bit_logic_result_type(op[0]->type, op[1]->type, this->oper,
-                                   state, &loc);
+      type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc);
       ir_rvalue *temp_rhs = new(ctx) ir_expression(operations[this->oper],
                                                    type, op[0], op[1]);
       error_emitted =

From e5fefe49f2f86b5125da87c44fe855fd6f3424fc Mon Sep 17 00:00:00 2001
From: Jeff Muizelaar <jmuizelaar@mozilla.com>
Date: Sat, 16 Jan 2016 03:35:26 +0100
Subject: [PATCH 023/119] gallivm: avoid crashing in mod by 0 with llvmpipe

This adds code that is basically the same as the code in umod, udiv and idiv.
However, unlike idiv we return -1.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 .../auxiliary/gallivm/lp_bld_tgsi_action.c     | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 3d5e2cb316b..6f75bec5005 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -1536,8 +1536,22 @@ mod_emit_cpu(
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   emit_data->output[emit_data->chan] = lp_build_mod(&bld_base->int_bld,
-                                   emit_data->args[0], emit_data->args[1]);
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef div_mask = lp_build_cmp(&bld_base->uint_bld,
+                                        PIPE_FUNC_EQUAL, emit_data->args[1],
+                                        bld_base->uint_bld.zero);
+   /* We want to make sure that we never divide/mod by zero to not
+    * generate sigfpe. We don't want to crash just because the
+    * shader is doing something weird. */
+   LLVMValueRef divisor = LLVMBuildOr(builder,
+                                      div_mask,
+                                      emit_data->args[1], "");
+   LLVMValueRef result = lp_build_mod(&bld_base->int_bld,
+                                      emit_data->args[0], divisor);
+   /* umod by zero doesn't have a guaranteed return value chose -1 for now. */
+   emit_data->output[emit_data->chan] = LLVMBuildOr(builder,
+                                                    div_mask,
+                                                    result, "");
 }
 
 /* TGSI_OPCODE_NOT */

From 32a9fe013b9fbc93d9045341cb7cbe13231523fb Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 15 Jan 2016 17:12:27 -0500
Subject: [PATCH 024/119] nv50/ir: add saturate support on ex2

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp   | 5 +++++
 src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp | 1 +
 2 files changed, 6 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index c126c085daf..bc8354deba1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -1463,6 +1463,7 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
 
    if (i->encSize == 4) {
       assert(i->op == OP_RCP);
+      assert(!i->saturate);
       code[0] |= i->src(0).mod.abs() << 15;
       code[0] |= i->src(0).mod.neg() << 22;
       emitForm_MUL(i);
@@ -1470,6 +1471,10 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
       code[1] = subOp << 29;
       code[1] |= i->src(0).mod.abs() << 20;
       code[1] |= i->src(0).mod.neg() << 26;
+      if (i->saturate) {
+         assert(subOp == 6 && i->op == OP_EX2);
+         code[1] |= 1 << 27;
+      }
       emitForm_MAD(i);
    }
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 101082e7491..2c4d7f53d60 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -99,6 +99,7 @@ static const struct opProperties _initProps[] =
    { OP_SET,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
    { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
    { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_EX2,    0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0 },
    { OP_LG2,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
    { OP_RCP,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
    { OP_RSQ,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },

From 211b0644e6e99fa223b448288efa93e7f1b740a4 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 4 Jan 2016 13:24:08 -0500
Subject: [PATCH 025/119] nir: few missing struct names

nir.h is a bit inconsistent about 'typedef struct {} nir_foo' vs
'typedef struct nir_foo {} nir_foo'.  But missing struct name tags is
inconvenient when you need a fwd declaration without pulling in all
of nir.

So add missing struct name tag for nir_variable, and a couple other
spots where it would likely be useful.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
---
 src/glsl/nir/nir.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 11add65988c..e2bd2bfa025 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -139,7 +139,7 @@ typedef enum {
  * ir_variable - it should be easy to translate between the two.
  */
 
-typedef struct {
+typedef struct nir_variable {
    struct exec_node node;
 
    /**
@@ -349,7 +349,7 @@ typedef struct {
 #define nir_foreach_variable(var, var_list) \
    foreach_list_typed(nir_variable, var, node, var_list)
 
-typedef struct {
+typedef struct nir_register {
    struct exec_node node;
 
    unsigned num_components; /** < number of vector components */
@@ -443,7 +443,7 @@ nir_instr_is_last(nir_instr *instr)
    return exec_node_is_tail_sentinel(exec_node_get_next(&instr->node));
 }
 
-typedef struct {
+typedef struct nir_ssa_def {
    /** for debugging only, can be NULL */
    const char* name;
 

From 683794fd608688b4228941a4d662ad2e9311863b Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 15 Jan 2016 18:24:11 -0500
Subject: [PATCH 026/119] nir/print: const_index is signed

Noticed this with $piglit/bin/vp-address-01

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/glsl/nir/nir_print.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c
index 80638ed58f2..be9ca4e0f57 100644
--- a/src/glsl/nir/nir_print.c
+++ b/src/glsl/nir/nir_print.c
@@ -480,7 +480,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
       if (i != 0)
          fprintf(fp, ", ");
 
-      fprintf(fp, "%u", instr->const_index[i]);
+      fprintf(fp, "%d", instr->const_index[i]);
    }
 
    fprintf(fp, ")");

From 6f0377d65180febf622318c10f7a9190fac22f3b Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 13 Jan 2016 18:39:56 -0500
Subject: [PATCH 027/119] ttn: add missing writemask on store_output

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 7c577592f70..9b194481277 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1932,6 +1932,7 @@ ttn_add_output_stores(struct ttn_compile *c)
          store->src[0].reg.reg = c->output_regs[loc].reg;
          store->src[0].reg.base_offset = c->output_regs[loc].offset;
          store->const_index[0] = loc;
+         store->const_index[1] = 0xf;  /* writemask */
          store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
          nir_builder_instr_insert(b, &store->instr);
       }

From d430f443de7dd554c68ba830ce23e56e87e2d9fe Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 6 Jan 2016 13:32:24 -0500
Subject: [PATCH 028/119] freedreno/ir3: cosmetic de-indent

Collapse two nested if's into one to reduce indent level.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_ra.c | 70 +++++++++++-----------
 1 file changed, 34 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 74755eb3bc0..bbe1cd1d2d6 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -626,44 +626,42 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		if (writes_gpr(instr)) {
 			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-			if (id->defn == instr) {
-				/* arrays which don't fit in one of the pre-defined class
-				 * sizes are pre-colored:
+			/* arrays which don't fit in one of the pre-defined class
+			 * sizes are pre-colored:
+			 */
+			if ((id->defn == instr) && (id->cls > 0)) {
+				unsigned name = ra_name(ctx, id->cls, id->defn);
+
+				ctx->def[name] = id->defn->ip;
+				ctx->use[name] = id->defn->ip;
+
+				/* since we are in SSA at this point: */
+				debug_assert(!BITSET_TEST(bd->use, name));
+
+				BITSET_SET(bd->def, name);
+
+				if (is_half(id->defn)) {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->half_classes[id->cls - class_count]);
+				} else {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->classes[id->cls]);
+				}
+
+				/* extend the live range for phi srcs, which may come
+				 * from the bottom of the loop
 				 */
-				if (id->cls >= 0) {
-					unsigned name = ra_name(ctx, id->cls, id->defn);
-
-					ctx->def[name] = id->defn->ip;
-					ctx->use[name] = id->defn->ip;
-
-					/* since we are in SSA at this point: */
-					debug_assert(!BITSET_TEST(bd->use, name));
-
-					BITSET_SET(bd->def, name);
-
-					if (is_half(id->defn)) {
-						ra_set_node_class(ctx->g, name,
-								ctx->set->half_classes[id->cls - class_count]);
-					} else {
-						ra_set_node_class(ctx->g, name,
-								ctx->set->classes[id->cls]);
-					}
-
-					/* extend the live range for phi srcs, which may come
-					 * from the bottom of the loop
-					 */
-					if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
-						struct ir3_instruction *phi = id->defn->regs[0]->instr;
-						foreach_ssa_src(src, phi) {
-							/* if src is after phi, then we need to extend
-							 * the liverange to the end of src's block:
-							 */
-							if (src->ip > phi->ip) {
-								struct ir3_instruction *last =
+				if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+					struct ir3_instruction *phi = id->defn->regs[0]->instr;
+					foreach_ssa_src(src, phi) {
+						/* if src is after phi, then we need to extend
+						 * the liverange to the end of src's block:
+						 */
+						if (src->ip > phi->ip) {
+							struct ir3_instruction *last =
 									list_last_entry(&src->block->instr_list,
-										struct ir3_instruction, node);
-								ctx->use[name] = MAX2(ctx->use[name], last->ip);
-							}
+											struct ir3_instruction, node);
+							ctx->use[name] = MAX2(ctx->use[name], last->ip);
 						}
 					}
 				}

From fc0d2f7e02bd9b3a552e7be47b2123d0afa38588 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 6 Jan 2016 14:08:34 -0500
Subject: [PATCH 029/119] freedreno/ir3: bit of ra refactor

Shuffle things slightly, passing instr-data to ra_name() to reduce the
number of places where we need to add support for array names.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_ra.c | 45 ++++++++++------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index bbe1cd1d2d6..88ca95acbbf 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -562,7 +562,7 @@ ra_init(struct ir3_ra_ctx *ctx)
 }
 
 static unsigned
-ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
 {
 	unsigned name;
 	debug_assert(cls >= 0);
@@ -571,6 +571,13 @@ ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
 	return name;
 }
 
+static int
+ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
+{
+	/* TODO handle name mapping for arrays */
+	return __ra_name(ctx, id->cls, id->defn);
+}
+
 static void
 ra_destroy(struct ir3_ra_ctx *ctx)
 {
@@ -626,11 +633,8 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		if (writes_gpr(instr)) {
 			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-			/* arrays which don't fit in one of the pre-defined class
-			 * sizes are pre-colored:
-			 */
-			if ((id->defn == instr) && (id->cls > 0)) {
-				unsigned name = ra_name(ctx, id->cls, id->defn);
+			if (id->defn == instr) {
+				unsigned name = ra_name(ctx, id);
 
 				ctx->def[name] = id->defn->ip;
 				ctx->use[name] = id->defn->ip;
@@ -670,14 +674,10 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		foreach_ssa_src(src, instr) {
 			if (writes_gpr(src)) {
-				struct ir3_ra_instr_data *id = &ctx->instrd[src->ip];
-
-				if (id->cls >= 0) {
-					unsigned name = ra_name(ctx, id->cls, id->defn);
-					ctx->use[name] = MAX2(ctx->use[name], instr->ip);
-					if (!BITSET_TEST(bd->def, name))
-						BITSET_SET(bd->use, name);
-				}
+				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
+				ctx->use[name] = MAX2(ctx->use[name], instr->ip);
+				if (!BITSET_TEST(bd->def, name))
+					BITSET_SET(bd->use, name);
 			}
 		}
 	}
@@ -765,12 +765,8 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 	/* need to fix things up to keep outputs live: */
 	for (unsigned i = 0; i < ir->noutputs; i++) {
 		struct ir3_instruction *instr = ir->outputs[i];
-		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-
-		if (id->cls >= 0) {
-			unsigned name = ra_name(ctx, id->cls, id->defn);
-			ctx->use[name] = ctx->instr_cnt;
-		}
+		unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
+		ctx->use[name] = ctx->instr_cnt;
 	}
 
 	for (unsigned i = 0; i < ctx->alloc_count; i++) {
@@ -839,9 +835,8 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 		struct ir3_instruction *instr)
 {
 	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-
-	if (id->cls >= 0) {
-		unsigned name = ra_name(ctx, id->cls, id->defn);
+	if (id->defn) {
+		unsigned name = ra_name(ctx, id);
 		unsigned r = ra_get_node_reg(ctx->g, name);
 		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
 
@@ -895,7 +890,7 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
 			struct ir3_instruction *instr = ir->inputs[i];
 			int cls = size_to_class(1, true);
-			unsigned name = ra_name(ctx, cls, instr);
+			unsigned name = __ra_name(ctx, cls, instr);
 			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
 
 			/* if we have frag_face, it gets hr0.x */
@@ -911,7 +906,7 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 				if (id->defn == instr) {
 					unsigned name, reg;
 
-					name = ra_name(ctx, id->cls, id->defn);
+					name = ra_name(ctx, id);
 					reg = ctx->set->gpr_to_ra_reg[id->cls][j];
 
 					ra_set_node_reg(ctx->g, name, reg);

From 2809c87f90e359bd94f1bd15d7615ea28010779a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 9 Jan 2016 14:46:36 -0500
Subject: [PATCH 030/119] freedreno/ir3: remove unused tgsi tokens ptr

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 86afda4ba08..e5d39097267 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -46,7 +46,6 @@
 struct ir3_compile {
 	struct ir3_compiler *compiler;
 
-	const struct tgsi_token *tokens;
 	struct nir_shader *s;
 
 	struct ir3 *ir;

From 680664dff930ecf3dd12a5086cdd0e10653b61dc Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Wed, 13 Jan 2016 13:07:51 -0500
Subject: [PATCH 031/119] freedreno/ir3: fix incorrect decoding of mov
 instructions

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/disasm-a3xx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index 83ed5ffdca0..1d5022b69c7 100644
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -220,7 +220,7 @@ static void print_instr_cat1(instr_t *instr)
 		else if (cat1->off > 0)
 			printf("%c<a0.x + %d>", type, cat1->off);
 		else
-			printf("c<a0.x>");
+			printf("%c<a0.x>", type);
 	} else {
 		print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32,
 				cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);

From cc7ed34df94a389e60221228896332d0ac8ea6c6 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 10 Jan 2016 11:03:46 -0500
Subject: [PATCH 032/119] freedreno/ir3: refactor/simplify cp

If we handle separately the special case of eliminating output mov
(which includes keeps and various other cases where we don't have a
consuming instruction's src register to collapse things into), we
can simplify the logic.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_cp.c | 171 ++++++++++-----------
 1 file changed, 83 insertions(+), 88 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index be4e4e81109..a6e69d2416f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -41,16 +41,22 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
 		struct ir3_register *dst = instr->regs[0];
 		struct ir3_register *src = instr->regs[1];
 		struct ir3_instruction *src_instr = ssa(src);
+
+		/* only if mov src is SSA (not const/immed): */
+		if (!src_instr)
+			return false;
+
+		/* no indirect: */
 		if (dst->flags & IR3_REG_RELATIV)
 			return false;
 		if (src->flags & IR3_REG_RELATIV)
 			return false;
+
 		if (!allow_flags)
 			if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
 					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
 				return false;
-		if (!src_instr)
-			return false;
+
 		/* TODO: remove this hack: */
 		if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
 			return false;
@@ -82,10 +88,17 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
 	unsigned valid_flags;
 	flags = cp_flags(flags);
 
+	/* If destination is indirect, then source cannot be.. at least
+	 * I don't think so..
+	 */
+	if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
+			(flags & IR3_REG_RELATIV))
+		return false;
+
 	/* clear flags that are 'ok' */
 	switch (instr->category) {
 	case 1:
-		valid_flags = IR3_REG_IMMED | IR3_REG_RELATIV;
+		valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
 		if (flags & ~valid_flags)
 			return false;
 		break;
@@ -183,9 +196,13 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags)
 		*dstflags ^= IR3_REG_SNEG;
 	if (srcflags & IR3_REG_BNOT)
 		*dstflags ^= IR3_REG_BNOT;
-}
 
-static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, unsigned *flags);
+	*dstflags &= ~IR3_REG_SSA;
+	*dstflags |= srcflags & IR3_REG_SSA;
+	*dstflags |= srcflags & IR3_REG_CONST;
+	*dstflags |= srcflags & IR3_REG_IMMED;
+	*dstflags |= srcflags & IR3_REG_RELATIV;
+}
 
 /* the "plain" MAD's (ie. the ones that don't shift first src prior to
  * multiply) can swap their first two srcs if src[0] is !CONST and
@@ -206,52 +223,31 @@ static bool is_valid_mad(struct ir3_instruction *instr)
 static void
 reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 {
-	unsigned src_flags = 0, new_flags;
-	struct ir3_instruction *src_instr;
+	struct ir3_instruction *src = ssa(reg);
 
-	if (is_meta(instr)) {
-		/* meta instructions cannot fold up register
-		 * flags.. they are usually src for texture
-		 * fetch, etc, where we cannot specify abs/neg
-		 */
-		reg->instr = instr_cp(reg->instr, NULL);
-		return;
-	}
+	if (is_eligible_mov(src, true)) {
+		/* simple case, no immed/const/relativ, only mov's w/ ssa src: */
+		struct ir3_register *src_reg = src->regs[1];
+		unsigned new_flags = reg->flags;
 
-	src_instr = instr_cp(reg->instr, &src_flags);
+		combine_flags(&new_flags, src_reg->flags);
 
-	new_flags = reg->flags;
-	combine_flags(&new_flags, src_flags);
-
-	reg->flags = new_flags;
-	reg->instr = src_instr;
-
-	if (!valid_flags(instr, n, reg->flags)) {
-		/* insert an absneg.f */
-		if (reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)) {
-			debug_assert(!(reg->flags & (IR3_REG_FNEG | IR3_REG_FABS)));
-			reg->instr = ir3_ABSNEG_S(instr->block,
-					reg->instr, cp_flags(src_flags));
-		} else {
-			debug_assert(!(reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)));
-			reg->instr = ir3_ABSNEG_F(instr->block,
-					reg->instr, cp_flags(src_flags));
+		if (valid_flags(instr, n, new_flags)) {
+			reg->flags = new_flags;
+			reg->instr = ssa(src_reg);
 		}
-		reg->flags &= ~cp_flags(src_flags);
-		debug_assert(valid_flags(instr, n, reg->flags));
-		/* send it through instr_cp() again since
-		 * the absneg src might be a mov from const
-		 * that could be cleaned up:
-		 */
-		reg->instr = instr_cp(reg->instr, NULL);
-		return;
-	}
 
-	if (is_same_type_mov(reg->instr)) {
-		struct ir3_register *src_reg = reg->instr->regs[1];
-		unsigned new_flags = src_reg->flags;
+		src = ssa(reg);      /* could be null for IR3_REG_ARRAY case */
+		if (!src)
+			return;
+	} else if (is_same_type_mov(src) &&
+			/* cannot collapse const/immed/etc into meta instrs: */
+			!is_meta(instr)) {
+		/* immed/const/etc cases, which require some special handling: */
+		struct ir3_register *src_reg = src->regs[1];
+		unsigned new_flags = reg->flags;
 
-		combine_flags(&new_flags, reg->flags);
+		combine_flags(&new_flags, src_reg->flags);
 
 		if (!valid_flags(instr, n, new_flags)) {
 			/* special case for "normal" mad instructions, we can
@@ -330,7 +326,8 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 			if (new_flags & IR3_REG_BNOT)
 				iim_val = ~iim_val;
 
-			if (!(iim_val & ~0x3ff)) {
+			/* other than category 1 (mov) we can only encode up to 10 bits: */
+			if ((instr->category == 1) || !(iim_val & ~0x3ff)) {
 				new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
 				src_reg->flags = new_flags;
 				src_reg->iim_val = iim_val;
@@ -342,56 +339,53 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 	}
 }
 
-/**
- * Given an SSA src (instruction), return the one with extraneous
- * mov's removed, ie, for (to copy NIR syntax):
- *
- *   vec1 ssa1 = fadd <something>, <somethingelse>
- *   vec1 ssa2 = fabs ssa1
- *   vec1 ssa3 = fneg ssa1
- *
- * then calling instr_cp(ssa3, &flags) would return ssa1 with
- * (IR3_REG_ABS | IR3_REG_NEGATE) in flags.  If flags is NULL,
- * then disallow eliminating copies which would require flag
- * propagation (for example, we cannot propagate abs/neg into
- * an output).
+/* Handle special case of eliminating output mov, and similar cases where
+ * there isn't a normal "consuming" instruction.  In this case we cannot
+ * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot
+ * be eliminated)
  */
 static struct ir3_instruction *
-instr_cp(struct ir3_instruction *instr, unsigned *flags)
+eliminate_output_mov(struct ir3_instruction *instr)
+{
+	if (is_eligible_mov(instr, false)) {
+		struct ir3_register *reg = instr->regs[1];
+		struct ir3_instruction *src_instr = ssa(reg);
+		debug_assert(src_instr);
+		return src_instr;
+	}
+	return instr;
+}
+
+/**
+ * Find instruction src's which are mov's that can be collapsed, replacing
+ * the mov dst with the mov src
+ */
+static void
+instr_cp(struct ir3_instruction *instr)
 {
 	struct ir3_register *reg;
 
-	if (is_eligible_mov(instr, !!flags)) {
-		struct ir3_register *reg = instr->regs[1];
-		struct ir3_instruction *src_instr = ssa(reg);
-		if (flags)
-			combine_flags(flags, reg->flags);
-		return instr_cp(src_instr, flags);
-	}
+	if (instr->regs_count == 0)
+		return;
 
-	/* Check termination condition before walking children (rather
-	 * than before checking eligible-mov).  A mov instruction may
-	 * appear as ssa-src for multiple other instructions, and we
-	 * want to consider it for removal for each, rather than just
-	 * the first one.  (But regardless of how many places it shows
-	 * up as a src, we only need to recursively walk the children
-	 * once.)
-	 */
 	if (ir3_instr_check_mark(instr))
-		return instr;
+		return;
 
 	/* walk down the graph from each src: */
 	foreach_src_n(reg, n, instr) {
-		if (!(reg->flags & IR3_REG_SSA))
+		struct ir3_instruction *src = ssa(reg);
+
+		if (!src)
 			continue;
 
+		instr_cp(src);
 		reg_cp(instr, reg, n);
 	}
 
-	if (instr->address)
-		ir3_instr_set_address(instr, instr_cp(instr->address, NULL));
-
-	return instr;
+	if (instr->address) {
+		instr_cp(instr->address);
+		ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
+	}
 }
 
 void
@@ -401,19 +395,20 @@ ir3_cp(struct ir3 *ir)
 
 	for (unsigned i = 0; i < ir->noutputs; i++) {
 		if (ir->outputs[i]) {
-			struct ir3_instruction *out =
-					instr_cp(ir->outputs[i], NULL);
-
-			ir->outputs[i] = out;
+			instr_cp(ir->outputs[i]);
+			ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
 		}
 	}
 
 	for (unsigned i = 0; i < ir->keeps_count; i++) {
-		ir->keeps[i] = instr_cp(ir->keeps[i], NULL);
+		instr_cp(ir->keeps[i]);
+		ir->keeps[i] = eliminate_output_mov(ir->keeps[i]);
 	}
 
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		if (block->condition)
-			block->condition = instr_cp(block->condition, NULL);
+		if (block->condition) {
+			instr_cp(block->condition);
+			block->condition = eliminate_output_mov(block->condition);
+		}
 	}
 }

From fad158a0e01f4c28851477e6d1eb5c8fd67e226b Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 10 Jan 2016 14:10:08 -0500
Subject: [PATCH 033/119] freedreno/ir3: array rework

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/freedreno_screen.c      |   9 +-
 src/gallium/drivers/freedreno/ir3/ir3.c       |  35 +-
 src/gallium/drivers/freedreno/ir3/ir3.h       |  91 +++--
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  | 333 ++++--------------
 src/gallium/drivers/freedreno/ir3/ir3_cp.c    |  29 +-
 src/gallium/drivers/freedreno/ir3/ir3_depth.c |   4 +
 src/gallium/drivers/freedreno/ir3/ir3_print.c |  34 +-
 src/gallium/drivers/freedreno/ir3/ir3_ra.c    | 190 +++++++---
 src/gallium/drivers/freedreno/ir3/ir3_sched.c |   3 +
 9 files changed, 365 insertions(+), 363 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index a75b04b327a..6562924dae1 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -400,9 +400,16 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		return 1;
 	case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
 	case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+		/* Technically this should be the same as for TEMP/CONST, since
+		 * everything is just normal registers.  This is just temporary
+		 * hack until load_input/store_output handle arrays in a similar
+		 * way as load_var/store_var..
+		 */
+		return 0;
 	case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
 	case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
-		return 1;
+		/* a2xx compiler doesn't handle indirect: */
+		return is_ir3(screen) ? 1 : 0;
 	case PIPE_SHADER_CAP_SUBROUTINES:
 	case PIPE_SHADER_CAP_DOUBLES:
 	case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index b24825cff85..be415d8e5fe 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -81,6 +81,7 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler,
 	shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
 
 	list_inithead(&shader->block_list);
+	list_inithead(&shader->array_list);
 
 	return shader;
 }
@@ -121,18 +122,19 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
 		val.iim_val = reg->iim_val;
 	} else {
 		unsigned components;
+		int16_t max;
 
 		if (reg->flags & IR3_REG_RELATIV) {
 			components = reg->size;
-			val.dummy10 = reg->offset;
+			val.dummy10 = reg->array.offset;
+			max = (reg->array.offset + repeat + components - 1) >> 2;
 		} else {
 			components = util_last_bit(reg->wrmask);
 			val.comp = reg->num & 0x3;
 			val.num  = reg->num >> 2;
+			max = (reg->num + repeat + components - 1) >> 2;
 		}
 
-		int16_t max = (reg->num + repeat + components - 1) >> 2;
-
 		if (reg->flags & IR3_REG_CONST) {
 			info->max_const = MAX2(info->max_const, max);
 		} else if (val.num == 63) {
@@ -233,7 +235,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
 	iassert((instr->regs_count == 2) || (instr->regs_count == 3));
 
 	if (src1->flags & IR3_REG_RELATIV) {
-		iassert(src1->num < (1 << 10));
+		iassert(src1->array.offset < (1 << 10));
 		cat2->rel1.src1      = reg(src1, info, instr->repeat,
 				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
 				IR3_REG_HALF | absneg);
@@ -260,7 +262,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
 				!((src1->flags ^ src2->flags) & IR3_REG_HALF));
 
 		if (src2->flags & IR3_REG_RELATIV) {
-			iassert(src2->num < (1 << 10));
+			iassert(src2->array.offset < (1 << 10));
 			cat2->rel2.src2      = reg(src2, info, instr->repeat,
 					IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
 					IR3_REG_HALF | absneg);
@@ -333,7 +335,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
 	iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
 
 	if (src1->flags & IR3_REG_RELATIV) {
-		iassert(src1->num < (1 << 10));
+		iassert(src1->array.offset < (1 << 10));
 		cat3->rel1.src1      = reg(src1, info, instr->repeat,
 				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
 				IR3_REG_HALF | absneg);
@@ -361,7 +363,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
 
 
 	if (src3->flags & IR3_REG_RELATIV) {
-		iassert(src3->num < (1 << 10));
+		iassert(src3->array.offset < (1 << 10));
 		cat3->rel2.src3      = reg(src3, info, instr->repeat,
 				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
 				IR3_REG_HALF | absneg);
@@ -404,7 +406,7 @@ static int emit_cat4(struct ir3_instruction *instr, void *ptr,
 	iassert(instr->regs_count == 2);
 
 	if (src->flags & IR3_REG_RELATIV) {
-		iassert(src->num < (1 << 10));
+		iassert(src->array.offset < (1 << 10));
 		cat4->rel.src      = reg(src, info, instr->repeat,
 				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
 				IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
@@ -737,6 +739,14 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 	return reg;
 }
 
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+		struct ir3_register *reg)
+{
+	struct ir3_register *new_reg = reg_create(shader, 0, 0);
+	*new_reg = *reg;
+	return new_reg;
+}
+
 void
 ir3_instr_set_address(struct ir3_instruction *instr,
 		struct ir3_instruction *addr)
@@ -777,3 +787,12 @@ ir3_count_instructions(struct ir3 *ir)
 	}
 	return cnt;
 }
+
+struct ir3_array *
+ir3_lookup_array(struct ir3 *ir, unsigned id)
+{
+	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node)
+		if (arr->id == id)
+			return arr;
+	return NULL;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 62d14a0ae37..1e5a1e9ee8b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -83,7 +83,8 @@ struct ir3_register {
 		 * before register assignment is done:
 		 */
 		IR3_REG_SSA    = 0x2000,   /* 'instr' is ptr to assigning instr */
-		IR3_REG_PHI_SRC= 0x4000,   /* phi src, regs[0]->instr points to phi */
+		IR3_REG_ARRAY  = 0x4000,
+		IR3_REG_PHI_SRC= 0x8000,   /* phi src, regs[0]->instr points to phi */
 
 	} flags;
 	union {
@@ -97,11 +98,18 @@ struct ir3_register {
 		uint32_t uim_val;
 		float    fim_val;
 		/* relative: */
-		int   offset;
+		struct {
+			uint16_t id;
+			uint16_t offset;
+		} array;
 	};
 
-	/* for IR3_REG_SSA, src registers contain ptr back to
-	 * assigning instruction.
+	/* For IR3_REG_SSA, src registers contain ptr back to assigning
+	 * instruction.
+	 *
+	 * For IR3_REG_ARRAY, the pointer is back to the last dependent
+	 * array access (although the net effect is the same, it points
+	 * back to a previous instruction that we depend on).
 	 */
 	struct ir3_instruction *instr;
 
@@ -221,9 +229,6 @@ struct ir3_instruction {
 		struct {
 			int off;              /* component/offset */
 		} fo;
-		struct {
-			int aid;
-		} fi;
 		struct {
 			/* used to temporarily hold reference to nir_phi_instr
 			 * until we resolve the phi srcs
@@ -293,19 +298,6 @@ struct ir3_instruction {
 	 */
 	struct ir3_instruction *address;
 
-	/* in case of a instruction with relative dst instruction, we need to
-	 * capture the dependency on the fanin for the previous values of
-	 * the array elements.  Since we don't know at compile time actually
-	 * which array elements are written, this serves to preserve the
-	 * unconditional write to array elements prior to the conditional
-	 * write.
-	 *
-	 * TODO only cat1 can do indirect write.. we could maybe move this
-	 * into instr->cat1.fanin (but would require the frontend to insert
-	 * the extra mov)
-	 */
-	struct ir3_instruction *fanin;
-
 	/* Entry in ir3_block's instruction list: */
 	struct list_head node;
 
@@ -379,10 +371,39 @@ struct ir3 {
 	/* List of blocks: */
 	struct list_head block_list;
 
+	/* List of ir3_array's: */
+	struct list_head array_list;
+
 	unsigned heap_idx;
 	struct ir3_heap_chunk *chunk;
 };
 
+typedef struct nir_variable nir_variable;
+
+struct ir3_array {
+	struct list_head node;
+	unsigned length;
+	unsigned id;
+
+	nir_variable *var;
+
+	/* We track the last write and last access (read or write) to
+	 * setup dependencies on instructions that read or write the
+	 * array.  Reads can be re-ordered wrt. other reads, but should
+	 * not be re-ordered wrt. to writes.  Writes cannot be reordered
+	 * wrt. any other access to the array.
+	 *
+	 * So array reads depend on last write, and array writes depend
+	 * on the last access.
+	 */
+	struct ir3_instruction *last_write, *last_access;
+
+	/* extra stuff used in RA pass: */
+	unsigned base;
+};
+
+struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
+
 typedef struct nir_block nir_block;
 
 struct ir3_block {
@@ -430,6 +451,8 @@ const char *ir3_instr_name(struct ir3_instruction *instr);
 
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 		int num, int flags);
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+		struct ir3_register *reg);
 
 void ir3_instr_set_address(struct ir3_instruction *instr,
 		struct ir3_instruction *addr);
@@ -510,6 +533,9 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr)
 	if (dst->num == regid(REG_A0, 0))
 		return false;
 
+	if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
+		return false;
+
 	if ((instr->category == 1) &&
 			(instr->cat1.src_type == instr->cat1.dst_type))
 		return true;
@@ -623,8 +649,10 @@ static inline bool writes_pred(struct ir3_instruction *instr)
 /* TODO better name */
 static inline struct ir3_instruction *ssa(struct ir3_register *reg)
 {
-	if (reg->flags & IR3_REG_SSA)
+	if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
+		debug_assert(!(reg->instr && (reg->instr->flags & IR3_INSTR_UNUSED)));
 		return reg->instr;
+	}
 	return NULL;
 }
 
@@ -813,8 +841,6 @@ static inline unsigned ir3_cat3_absneg(opc_t opc)
 
 static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
 {
-	if (instr->fanin)
-		return instr->regs_count + 2;
 	if (instr->address)
 		return instr->regs_count + 1;
 	return instr->regs_count;
@@ -822,8 +848,6 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
 
 static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
 {
-	if (n == (instr->regs_count + 1))
-		return instr->fanin;
 	if (n == (instr->regs_count + 0))
 		return instr->address;
 	return ssa(instr->regs[n]);
@@ -834,8 +858,8 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr
 /* iterator for an instruction's SSA sources (instr), also returns src #: */
 #define foreach_ssa_src_n(__srcinst, __n, __instr) \
 	if ((__instr)->regs_count) \
-		for (unsigned __cnt = __ssa_src_cnt(__instr) - 1, __n = 0; __n < __cnt; __n++) \
-			if ((__srcinst = __ssa_src_n(__instr, __n + 1)))
+		for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
+			if ((__srcinst = __ssa_src_n(__instr, __n)))
 
 /* iterator for an instruction's SSA sources (instr): */
 #define foreach_ssa_src(__srcinst, __instr) \
@@ -878,7 +902,15 @@ ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
 	struct ir3_instruction *instr =
 		ir3_instr_create(block, 1, 0);
 	ir3_reg_create(instr, 0, 0);   /* dst */
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+	if (src->regs[0]->flags & IR3_REG_ARRAY) {
+		struct ir3_register *src_reg =
+			ir3_reg_create(instr, 0, IR3_REG_ARRAY);
+		src_reg->array = src->regs[0]->array;
+		src_reg->instr = src;
+	} else {
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+	}
+	debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
 	instr->cat1.src_type = type;
 	instr->cat1.dst_type = type;
 	return instr;
@@ -894,6 +926,7 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
 	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
 	instr->cat1.src_type = src_type;
 	instr->cat1.dst_type = dst_type;
+	debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
 	return instr;
 }
 
@@ -1083,7 +1116,7 @@ typedef uint8_t regmask_t[2 * MAX_REG / 8];
 
 static inline unsigned regmask_idx(struct ir3_register *reg)
 {
-	unsigned num = reg->num;
+	unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
 	debug_assert(num < MAX_REG);
 	if (reg->flags & IR3_REG_HALF)
 		num += MAX_REG;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index e5d39097267..bd0ee89a1ec 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -74,8 +74,6 @@ struct ir3_compile {
 	/* mapping from nir_register to defining instruction: */
 	struct hash_table *def_ht;
 
-	/* mapping from nir_variable to ir3_array: */
-	struct hash_table *var_ht;
 	unsigned num_arrays;
 
 	/* a common pattern for indirect addressing is to request the
@@ -142,8 +140,6 @@ compile_init(struct ir3_compiler *compiler,
 	ctx->so = so;
 	ctx->def_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
-	ctx->var_ht = _mesa_hash_table_create(ctx,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
 	ctx->block_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 
@@ -220,206 +216,26 @@ compile_free(struct ir3_compile *ctx)
 	ralloc_free(ctx);
 }
 
-/* global per-array information: */
-struct ir3_array {
-	unsigned length, aid;
-};
-
-/* per-block array state: */
-struct ir3_array_value {
-	/* TODO drop length/aid, and just have ptr back to ir3_array */
-	unsigned length, aid;
-	/* initial array element values are phi's, other than for the
-	 * entry block.  The phi src's get added later in a resolve step
-	 * after we have visited all the blocks, to account for back
-	 * edges in the cfg.
-	 */
-	struct ir3_instruction **phis;
-	/* current array element values (as block is processed).  When
-	 * the array phi's are resolved, it will contain the array state
-	 * at exit of block, so successor blocks can use it to add their
-	 * phi srcs.
-	 */
-	struct ir3_instruction *arr[];
-};
-
-/* track array assignments per basic block.  When an array is read
- * outside of the same basic block, we can use NIR's dominance-frontier
- * information to figure out where phi nodes are needed.
- */
-struct ir3_nir_block_data {
-	unsigned foo;
-	/* indexed by array-id (aid): */
-	struct ir3_array_value *arrs[];
-};
-
-static struct ir3_nir_block_data *
-get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
-{
-	if (!block->data) {
-		struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) +
-				((ctx->num_arrays + 1) * sizeof(bd->arrs[0])));
-		block->data = bd;
-	}
-	return block->data;
-}
-
 static void
 declare_var(struct ir3_compile *ctx, nir_variable *var)
 {
 	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
 	struct ir3_array *arr = ralloc(ctx, struct ir3_array);
+	arr->id = ++ctx->num_arrays;
 	arr->length = length;
-	arr->aid = ++ctx->num_arrays;
-	_mesa_hash_table_insert(ctx->var_ht, var, arr);
+	arr->var = var;
+	list_addtail(&arr->node, &ctx->ir->array_list);
 }
 
-static nir_block *
-nir_block_pred(nir_block *block)
-{
-	assert(block->predecessors->entries < 2);
-	if (block->predecessors->entries == 0)
-		return NULL;
-	return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
-}
-
-static struct ir3_array_value *
+static struct ir3_array *
 get_var(struct ir3_compile *ctx, nir_variable *var)
 {
-	struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
-	struct ir3_block *block = ctx->block;
-	struct ir3_nir_block_data *bd = get_block_data(ctx, block);
-	struct ir3_array *arr = entry->data;
-
-	if (!bd->arrs[arr->aid]) {
-		struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) +
-				(arr->length * sizeof(av->arr[0])));
-		struct ir3_array_value *defn = NULL;
-		nir_block *pred_block;
-
-		av->length = arr->length;
-		av->aid = arr->aid;
-
-		/* For loops, we have to consider that we have not visited some
-		 * of the blocks who should feed into the phi (ie. back-edges in
-		 * the cfg).. for example:
-		 *
-		 *   loop {
-		 *      block { load_var; ... }
-		 *      if then block {} else block {}
-		 *      block { store_var; ... }
-		 *      if then block {} else block {}
-		 *      block {...}
-		 *   }
-		 *
-		 * We can skip the phi if we can chase the block predecessors
-		 * until finding the block previously defining the array without
-		 * crossing a block that has more than one predecessor.
-		 *
-		 * Otherwise create phi's and resolve them as a post-pass after
-		 * all the blocks have been visited (to handle back-edges).
-		 */
-
-		for (pred_block = block->nblock;
-				pred_block && (pred_block->predecessors->entries < 2) && !defn;
-				pred_block = nir_block_pred(pred_block)) {
-			struct ir3_block *pblock = get_block(ctx, pred_block);
-			struct ir3_nir_block_data *pbd = pblock->data;
-			if (!pbd)
-				continue;
-			defn = pbd->arrs[arr->aid];
-		}
-
-		if (defn) {
-			/* only one possible definer: */
-			for (unsigned i = 0; i < arr->length; i++)
-				av->arr[i] = defn->arr[i];
-		} else if (pred_block) {
-			/* not the first block, and multiple potential definers: */
-			av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
-
-			for (unsigned i = 0; i < arr->length; i++) {
-				struct ir3_instruction *phi;
-
-				phi = ir3_instr_create2(block, -1, OPC_META_PHI,
-						1 + ctx->impl->num_blocks);
-				ir3_reg_create(phi, 0, 0);         /* dst */
-
-				/* phi's should go at head of block: */
-				list_delinit(&phi->node);
-				list_add(&phi->node, &block->instr_list);
-
-				av->phis[i] = av->arr[i] = phi;
-			}
-		} else {
-			/* Some shaders end up reading array elements without
-			 * first writing.. so initialize things to prevent null
-			 * instr ptrs later:
-			 */
-			for (unsigned i = 0; i < arr->length; i++)
-				av->arr[i] = create_immed(block, 0);
-		}
-
-		bd->arrs[arr->aid] = av;
-	}
-
-	return bd->arrs[arr->aid];
-}
-
-static void
-add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
-		struct ir3_array_value *av, BITSET_WORD *visited)
-{
-	struct ir3_block *block;
-	struct ir3_nir_block_data *bd;
-
-	if (BITSET_TEST(visited, nblock->index))
-		return;
-
-	BITSET_SET(visited, nblock->index);
-
-	block = get_block(ctx, nblock);
-	bd = block->data;
-
-	if (bd && bd->arrs[av->aid]) {
-		struct ir3_array_value *dav = bd->arrs[av->aid];
-		for (unsigned i = 0; i < av->length; i++) {
-			ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
-					dav->arr[i];
-		}
-	} else {
-		/* didn't find defn, recurse predecessors: */
-		struct set_entry *entry;
-		set_foreach(nblock->predecessors, entry) {
-			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
-		}
-	}
-}
-
-static void
-resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
-{
-	struct ir3_nir_block_data *bd = block->data;
-	unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
-
-	if (!bd)
-		return;
-
-	/* TODO use nir dom_frontier to help us with this? */
-
-	for (unsigned i = 1; i <= ctx->num_arrays; i++) {
-		struct ir3_array_value *av = bd->arrs[i];
-		BITSET_WORD visited[bitset_words];
-		struct set_entry *entry;
-
-		if (!(av && av->phis))
-			continue;
-
-		memset(visited, 0, sizeof(visited));
-		set_foreach(block->nblock->predecessors, entry) {
-			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
-		}
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		if (arr->var == var)
+			return arr;
 	}
+	compile_error(ctx, "bogus var: %s\n", var->name);
+	return NULL;
 }
 
 /* allocate a n element value array (to be populated by caller) and
@@ -437,6 +253,7 @@ __get_dst(struct ir3_compile *ctx, void *key, unsigned n)
 static struct ir3_instruction **
 get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
 {
+	compile_assert(ctx, dst->is_ssa);
 	if (dst->is_ssa) {
 		return __get_dst(ctx, &dst->ssa, n);
 	} else {
@@ -454,6 +271,7 @@ static struct ir3_instruction **
 get_src(struct ir3_compile *ctx, nir_src *src)
 {
 	struct hash_entry *entry;
+	compile_assert(ctx, src->is_ssa);
 	if (src->is_ssa) {
 		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
 	} else {
@@ -568,7 +386,7 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
-	ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
+	ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
 
 	ir3_instr_set_address(mov, address);
 
@@ -607,17 +425,45 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
 	src->instr = collect;
 	src->size  = arrsz;
-	src->offset = n;
+	src->array.offset = n;
 
 	ir3_instr_set_address(mov, address);
 
 	return mov;
 }
 
+/* relative (indirect) if address!=NULL */
 static struct ir3_instruction *
-create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
-		struct ir3_instruction *src, struct ir3_instruction *address,
-		struct ir3_instruction *collect)
+create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, unsigned n,
+		struct ir3_instruction *address)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *mov;
+	struct ir3_register *src;
+
+	mov = ir3_instr_create(block, 1, 0);
+	mov->cat1.src_type = TYPE_U32;
+	mov->cat1.dst_type = TYPE_U32;
+	ir3_reg_create(mov, 0, 0);
+	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	src->instr = arr->last_write;
+	src->size  = arr->length;
+	src->array.id = arr->id;
+	src->array.offset = n;
+
+	if (address)
+		ir3_instr_set_address(mov, address);
+
+	arr->last_access = mov;
+
+	return mov;
+}
+
+/* relative (indirect) if address!=NULL */
+static struct ir3_instruction *
+create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, unsigned n,
+		struct ir3_instruction *src, struct ir3_instruction *address)
 {
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *mov;
@@ -626,14 +472,18 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 	mov = ir3_instr_create(block, 1, 0);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
-	dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV);
-	dst->size  = arrsz;
-	dst->offset = n;
+	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	dst->instr = arr->last_access;
+	dst->size  = arr->length;
+	dst->array.id = arr->id;
+	dst->array.offset = n;
 	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
-	mov->fanin = collect;
 
 	ir3_instr_set_address(mov, address);
 
+	arr->last_write = arr->last_access = mov;
+
 	return mov;
 }
 
@@ -1198,7 +1048,7 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array_value *arr = get_var(ctx, dvar->var);
+	struct ir3_array *arr = get_var(ctx, dvar->var);
 
 	compile_assert(ctx, dvar->deref.child &&
 		(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1209,19 +1059,17 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = darr->base_offset * 4 + i;
 			compile_assert(ctx, n < arr->length);
-			dst[i] = arr->arr[n];
+			dst[i] = create_var_load(ctx, arr, n, NULL);
 		}
 		break;
 	case nir_deref_array_type_indirect: {
 		/* for indirect, we need to collect all the array elements: */
-		struct ir3_instruction *collect =
-				create_collect(ctx->block, arr->arr, arr->length);
 		struct ir3_instruction *addr =
 				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = darr->base_offset * 4 + i;
 			compile_assert(ctx, n < arr->length);
-			dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect);
+			dst[i] = create_var_load(ctx, arr, n, addr);
 		}
 		break;
 	}
@@ -1238,8 +1086,9 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array_value *arr = get_var(ctx, dvar->var);
-	struct ir3_instruction **src;
+	struct ir3_array *arr = get_var(ctx, dvar->var);
+	struct ir3_instruction *addr, **src;
+	unsigned wrmask = intr->const_index[0];
 
 	compile_assert(ctx, dvar->deref.child &&
 		(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1248,66 +1097,24 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 
 	switch (darr->deref_array_type) {
 	case nir_deref_array_type_direct:
-		/* direct access does not require anything special: */
-		for (int i = 0; i < intr->num_components; i++) {
-			/* ttn doesn't generate partial writemasks */
-			assert(intr->const_index[0] ==
-			       (1 << intr->num_components) - 1);
-
-			unsigned n = darr->base_offset * 4 + i;
-			compile_assert(ctx, n < arr->length);
-			arr->arr[n] = src[i];
-		}
+		addr = NULL;
 		break;
-	case nir_deref_array_type_indirect: {
-		/* for indirect, create indirect-store and fan that out: */
-		struct ir3_instruction *collect =
-				create_collect(ctx->block, arr->arr, arr->length);
-		struct ir3_instruction *addr =
-				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
-		for (int i = 0; i < intr->num_components; i++) {
-			/* ttn doesn't generate partial writemasks */
-			assert(intr->const_index[0] ==
-			       (1 << intr->num_components) - 1);
-
-			struct ir3_instruction *store;
-			unsigned n = darr->base_offset * 4 + i;
-			compile_assert(ctx, n < arr->length);
-
-			store = create_indirect_store(ctx, arr->length,
-					n, src[i], addr, collect);
-
-			store->fanin->fi.aid = arr->aid;
-
-			/* TODO: probably split this out to be used for
-			 * store_output_indirect? or move this into
-			 * create_indirect_store()?
-			 */
-			for (int j = i; j < arr->length; j += intr->num_components) {
-				struct ir3_instruction *split;
-
-				split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
-				split->fo.off = j;
-				ir3_reg_create(split, 0, 0);
-				ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store;
-
-				arr->arr[j] = split;
-			}
-		}
-		/* fixup fanout/split neighbors: */
-		for (int i = 0; i < arr->length; i++) {
-			arr->arr[i]->cp.right = (i < (arr->length - 1)) ?
-					arr->arr[i+1] : NULL;
-			arr->arr[i]->cp.left = (i > 0) ?
-					arr->arr[i-1] : NULL;
-		}
+	case nir_deref_array_type_indirect:
+		addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
 		break;
-	}
 	default:
 		compile_error(ctx, "Unhandled store deref type: %u\n",
 				darr->deref_array_type);
 		break;
 	}
+
+	for (int i = 0; i < intr->num_components; i++) {
+		if (!(wrmask & (1 << i)))
+			continue;
+		unsigned n = darr->base_offset * 4 + i;
+		compile_assert(ctx, n < arr->length);
+		create_var_store(ctx, arr, n, src[i], addr);
+	}
 }
 
 static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
@@ -1835,8 +1642,6 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
 			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
 		}
 	}
-
-	resolve_array_phis(ctx, block);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index a6e69d2416f..0d88e7bc3ab 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -202,6 +202,7 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags)
 	*dstflags |= srcflags & IR3_REG_CONST;
 	*dstflags |= srcflags & IR3_REG_IMMED;
 	*dstflags |= srcflags & IR3_REG_RELATIV;
+	*dstflags |= srcflags & IR3_REG_ARRAY;
 }
 
 /* the "plain" MAD's (ie. the ones that don't shift first src prior to
@@ -233,6 +234,10 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 		combine_flags(&new_flags, src_reg->flags);
 
 		if (valid_flags(instr, n, new_flags)) {
+			if (new_flags & IR3_REG_ARRAY) {
+				debug_assert(!(reg->flags & IR3_REG_ARRAY));
+				reg->array = src_reg->array;
+			}
 			reg->flags = new_flags;
 			reg->instr = ssa(src_reg);
 		}
@@ -283,6 +288,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 					conflicts(instr->address, reg->instr->address))
 				return;
 
+			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
 			src_reg->flags = new_flags;
 			instr->regs[n+1] = src_reg;
 
@@ -294,6 +300,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 
 		if ((src_reg->flags & IR3_REG_RELATIV) &&
 				!conflicts(instr->address, reg->instr->address)) {
+			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
 			src_reg->flags = new_flags;
 			instr->regs[n+1] = src_reg;
 			ir3_instr_set_address(instr, reg->instr->address);
@@ -329,6 +336,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 			/* other than category 1 (mov) we can only encode up to 10 bits: */
 			if ((instr->category == 1) || !(iim_val & ~0x3ff)) {
 				new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
+				src_reg = ir3_reg_clone(instr->block->shader, src_reg);
 				src_reg->flags = new_flags;
 				src_reg->iim_val = iim_val;
 				instr->regs[n+1] = src_reg;
@@ -349,9 +357,11 @@ eliminate_output_mov(struct ir3_instruction *instr)
 {
 	if (is_eligible_mov(instr, false)) {
 		struct ir3_register *reg = instr->regs[1];
-		struct ir3_instruction *src_instr = ssa(reg);
-		debug_assert(src_instr);
-		return src_instr;
+		if (!(reg->flags & IR3_REG_ARRAY)) {
+			struct ir3_instruction *src_instr = ssa(reg);
+			debug_assert(src_instr);
+			return src_instr;
+		}
 	}
 	return instr;
 }
@@ -379,9 +389,22 @@ instr_cp(struct ir3_instruction *instr)
 			continue;
 
 		instr_cp(src);
+
+		/* TODO non-indirect access we could figure out which register
+		 * we actually want and allow cp..
+		 */
+		if (reg->flags & IR3_REG_ARRAY)
+			continue;
+
 		reg_cp(instr, reg, n);
 	}
 
+	if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+		struct ir3_instruction *src = ssa(instr->regs[0]);
+		if (src)
+			instr_cp(src);
+	}
+
 	if (instr->address) {
 		instr_cp(instr->address);
 		ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index 4bbc0458790..3354cbd23fa 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -118,6 +118,10 @@ ir3_instr_depth(struct ir3_instruction *instr)
 		/* visit child to compute it's depth: */
 		ir3_instr_depth(src);
 
+		/* for array writes, no need to delay on previous write: */
+		if (i == 0)
+			continue;
+
 		sd = ir3_delayslots(src, instr, i) + src->depth;
 
 		instr->depth = MAX2(instr->depth, sd);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
index a84e7989cf8..ec832f5d72a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -94,7 +94,7 @@ static void print_instr_name(struct ir3_instruction *instr)
 	}
 }
 
-static void print_reg_name(struct ir3_register *reg, bool followssa)
+static void print_reg_name(struct ir3_register *reg)
 {
 	if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
 			(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
@@ -106,20 +106,29 @@ static void print_reg_name(struct ir3_register *reg, bool followssa)
 
 	if (reg->flags & IR3_REG_IMMED) {
 		printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
-	} else if (reg->flags & IR3_REG_SSA) {
-		printf("_");
-		if (followssa) {
-			printf("[");
+	} else if (reg->flags & IR3_REG_ARRAY) {
+		printf("arr[id=%u, offset=%u, size=%u", reg->array.id,
+				reg->array.offset, reg->size);
+		/* for ARRAY we could have null src, for example first write
+		 * instruction..
+		 */
+		if (reg->instr) {
+			printf(", _[");
 			print_instr_name(reg->instr);
 			printf("]");
 		}
+		printf("]");
+	} else if (reg->flags & IR3_REG_SSA) {
+		printf("_[");
+		print_instr_name(reg->instr);
+		printf("]");
 	} else if (reg->flags & IR3_REG_RELATIV) {
 		if (reg->flags & IR3_REG_HALF)
 			printf("h");
 		if (reg->flags & IR3_REG_CONST)
-			printf("c<a0.x + %u>", reg->num);
+			printf("c<a0.x + %u>", reg->array.offset);
 		else
-			printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size);
+			printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->array.offset, reg->size);
 	} else {
 		if (reg->flags & IR3_REG_HALF)
 			printf("h");
@@ -158,7 +167,7 @@ print_instr(struct ir3_instruction *instr, int lvl)
 	for (i = 0; i < instr->regs_count; i++) {
 		struct ir3_register *reg = instr->regs[i];
 		printf(i ? ", " : " ");
-		print_reg_name(reg, !!i);
+		print_reg_name(reg);
 	}
 
 	if (instr->address) {
@@ -168,13 +177,6 @@ print_instr(struct ir3_instruction *instr, int lvl)
 		printf("]");
 	}
 
-	if (instr->fanin) {
-		printf(", fanin=_");
-		printf("[");
-		print_instr_name(instr->fanin);
-		printf("]");
-	}
-
 	if (instr->cp.left) {
 		printf(", left=_");
 		printf("[");
@@ -192,8 +194,6 @@ print_instr(struct ir3_instruction *instr, int lvl)
 	if (is_meta(instr)) {
 		if (instr->opc == OPC_META_FO) {
 			printf(", off=%d", instr->fo.off);
-		} else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
-			printf(", aid=%d", instr->fi.aid);
 		}
 	}
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 88ca95acbbf..3c42f8e1592 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -68,25 +68,24 @@
  * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
  * register assignment.  But for us that is horrible from a scheduling
  * standpoint.  Instead what we do is use idea of 'definer' instruction.
- * Ie. the first instruction (lowest ip) to write to the array is the
+ * Ie. the first instruction (lowest ip) to write to the variable is the
  * one we consider from use/def perspective when building interference
- * graph.  (Other instructions which write other array elements just
- * define the variable some more.)
+ * graph.  (Other instructions which write other variable components
+ * just define the variable some more.)
+ *
+ * Arrays of arbitrary size are handled via pre-coloring a consecutive
+ * sequence of registers.  Additional scalar (single component) reg
+ * names are allocated starting at ctx->class_base[total_class_count]
+ * (see arr->base), which are pre-colored.  In the use/def graph direct
+ * access is treated as a single element use/def, and indirect access
+ * is treated as use or def of all array elements.  (Only the first
+ * def is tracked, in case of multiple indirect writes, etc.)
  */
 
 static const unsigned class_sizes[] = {
 	1, 2, 3, 4,
 	4 + 4, /* txd + 1d/2d */
 	4 + 6, /* txd + 3d */
-	/* temporary: until we can assign arrays, create classes so we
-	 * can round up array to fit.  NOTE with tgsi arrays should
-	 * really all be multiples of four:
-	 */
-	4 * 4,
-	4 * 8,
-	4 * 16,
-	4 * 32,
-
 };
 #define class_count ARRAY_SIZE(class_sizes)
 
@@ -265,8 +264,9 @@ struct ir3_ra_ctx {
 	struct ir3_ra_reg_set *set;
 	struct ra_graph *g;
 	unsigned alloc_count;
-	unsigned class_alloc_count[total_class_count];
-	unsigned class_base[total_class_count];
+	/* one per class, plus one slot for arrays: */
+	unsigned class_alloc_count[total_class_count + 1];
+	unsigned class_base[total_class_count + 1];
 	unsigned instr_cnt;
 	unsigned *def, *use;     /* def/use table */
 	struct ir3_ra_instr_data *instrd;
@@ -329,9 +329,6 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
 	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 	struct ir3_instruction *d = NULL;
 
-	if (instr->fanin)
-		return get_definer(ctx, instr->fanin, sz, off);
-
 	if (id->defn) {
 		*sz = id->sz;
 		*off = id->off;
@@ -485,10 +482,13 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		/* couple special cases: */
 		if (writes_addr(instr) || writes_pred(instr)) {
 			id->cls = -1;
-			continue;
+		} else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+			id->cls = total_class_count;
+			id->defn = instr;
+		} else {
+			id->defn = get_definer(ctx, instr, &id->sz, &id->off);
+			id->cls = size_to_class(id->sz, is_half(id->defn));
 		}
-		id->defn = get_definer(ctx, instr, &id->sz, &id->off);
-		id->cls = size_to_class(id->sz, is_half(id->defn));
 	}
 }
 
@@ -518,8 +518,6 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		/* arrays which don't fit in one of the pre-defined class
 		 * sizes are pre-colored:
-		 *
-		 * TODO but we still need to allocate names for them, don't we??
 		 */
 		if (id->cls >= 0) {
 			instr->name = ctx->class_alloc_count[id->cls]++;
@@ -531,7 +529,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 static void
 ra_init(struct ir3_ra_ctx *ctx)
 {
-	unsigned n;
+	unsigned n, base;
 
 	ir3_clear_mark(ctx->ir);
 	n = ir3_count_instructions(ctx->ir);
@@ -550,11 +548,20 @@ ra_init(struct ir3_ra_ctx *ctx)
 	 * actual ra name is class_base[cls] + instr->name;
 	 */
 	ctx->class_base[0] = 0;
-	for (unsigned i = 1; i < total_class_count; i++) {
+	for (unsigned i = 1; i <= total_class_count; i++) {
 		ctx->class_base[i] = ctx->class_base[i-1] +
 				ctx->class_alloc_count[i-1];
 	}
 
+	/* and vreg names for array elements: */
+	base = ctx->class_base[total_class_count];
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		arr->base = base;
+		ctx->class_alloc_count[total_class_count] += arr->length;
+		base += arr->length;
+	}
+	ctx->alloc_count += ctx->class_alloc_count[total_class_count];
+
 	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
 	ralloc_steal(ctx->g, ctx->instrd);
 	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
@@ -566,6 +573,7 @@ __ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
 {
 	unsigned name;
 	debug_assert(cls >= 0);
+	debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
 	name = ctx->class_base[cls] + defn->name;
 	debug_assert(name < ctx->alloc_count);
 	return name;
@@ -590,6 +598,22 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 	struct ir3_ra_block_data *bd;
 	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
 
+	void def(unsigned name, struct ir3_instruction *instr)
+	{
+		/* defined on first write: */
+		if (!ctx->def[name])
+			ctx->def[name] = instr->ip;
+		ctx->use[name] = instr->ip;
+		BITSET_SET(bd->def, name);
+	}
+
+	void use(unsigned name, struct ir3_instruction *instr)
+	{
+		ctx->use[name] = MAX2(ctx->use[name], instr->ip);
+		if (!BITSET_TEST(bd->def, name))
+			BITSET_SET(bd->use, name);
+	}
+
 	bd = rzalloc(ctx->g, struct ir3_ra_block_data);
 
 	bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
@@ -601,6 +625,7 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
 		struct ir3_instruction *src;
+		struct ir3_register *reg;
 
 		if (instr->regs_count == 0)
 			continue;
@@ -632,17 +657,45 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		if (writes_gpr(instr)) {
 			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+			struct ir3_register *dst = instr->regs[0];
 
-			if (id->defn == instr) {
+			if (dst->flags & IR3_REG_ARRAY) {
+				struct ir3_array *arr =
+					ir3_lookup_array(ctx->ir, dst->array.id);
+				unsigned i;
+
+				debug_assert(!(dst->flags & IR3_REG_PHI_SRC));
+
+				/* set the node class now.. in case we don't encounter
+				 * this array dst again.  From register_alloc algo's
+				 * perspective, these are all single/scalar regs:
+				 */
+				for (i = 0; i < arr->length; i++) {
+					unsigned name = arr->base + i;
+					ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
+				}
+
+				/* indirect write is treated like a write to all array
+				 * elements, since we don't know which one is actually
+				 * written:
+				 */
+				if (dst->flags & IR3_REG_RELATIV) {
+					for (i = 0; i < arr->length; i++) {
+						unsigned name = arr->base + i;
+						def(name, instr);
+					}
+				} else {
+					unsigned name = arr->base + dst->array.offset;
+					def(name, instr);
+				}
+
+			} else if (id->defn == instr) {
 				unsigned name = ra_name(ctx, id);
 
-				ctx->def[name] = id->defn->ip;
-				ctx->use[name] = id->defn->ip;
-
 				/* since we are in SSA at this point: */
 				debug_assert(!BITSET_TEST(bd->use, name));
 
-				BITSET_SET(bd->def, name);
+				def(name, id->defn);
 
 				if (is_half(id->defn)) {
 					ra_set_node_class(ctx->g, name,
@@ -672,12 +725,28 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 			}
 		}
 
-		foreach_ssa_src(src, instr) {
-			if (writes_gpr(src)) {
+		foreach_src(reg, instr) {
+			if (reg->flags & IR3_REG_ARRAY) {
+				struct ir3_array *arr =
+					ir3_lookup_array(ctx->ir, reg->array.id);
+				/* indirect read is treated like a read fromall array
+				 * elements, since we don't know which one is actually
+				 * read:
+				 */
+				if (reg->flags & IR3_REG_RELATIV) {
+					unsigned i;
+					for (i = 0; i < arr->length; i++) {
+						unsigned name = arr->base + i;
+						use(name, instr);
+					}
+				} else {
+					unsigned name = arr->base + reg->array.offset;
+					use(name, instr);
+					debug_assert(reg->array.offset < arr->length);
+				}
+			} else if ((src = ssa(reg)) && writes_gpr(src)) {
 				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
-				ctx->use[name] = MAX2(ctx->use[name], instr->ip);
-				if (!BITSET_TEST(bd->def, name))
-					BITSET_SET(bd->use, name);
+				use(name, instr);
 			}
 		}
 	}
@@ -830,18 +899,36 @@ static void fixup_half_instr_src(struct ir3_instruction *instr)
 	}
 }
 
+/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
+ * array access(es) which do not have any previous access to depend
+ * on from scheduling point of view
+ */
 static void
 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 		struct ir3_instruction *instr)
 {
-	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-	if (id->defn) {
+	struct ir3_ra_instr_data *id;
+
+	if (reg->flags & IR3_REG_ARRAY) {
+		struct ir3_array *arr =
+			ir3_lookup_array(ctx->ir, reg->array.id);
+		unsigned name = arr->base + reg->array.offset;
+		unsigned r = ra_get_node_reg(ctx->g, name);
+		unsigned num = ctx->set->ra_reg_to_gpr[r];
+
+		if (reg->flags & IR3_REG_RELATIV) {
+			reg->array.offset = num;
+		} else {
+			reg->num = num;
+		}
+
+		reg->flags &= ~IR3_REG_ARRAY;
+	} else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
 		unsigned name = ra_name(ctx, id);
 		unsigned r = ra_get_node_reg(ctx->g, name);
 		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
 
-		if (reg->flags & IR3_REG_RELATIV)
-			num += reg->offset;
+		debug_assert(!(reg->flags & IR3_REG_RELATIV));
 
 		reg->num = num;
 		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
@@ -868,9 +955,9 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		foreach_src_n(reg, n, instr) {
 			struct ir3_instruction *src = reg->instr;
-			if (!src)
+			/* Note: reg->instr could be null for IR3_REG_ARRAY */
+			if (!(src || (reg->flags & IR3_REG_ARRAY)))
 				continue;
-
 			reg_assign(ctx, instr->regs[n+1], src);
 			if (instr->regs[n+1]->flags & IR3_REG_HALF)
 				fixup_half_instr_src(instr);
@@ -881,6 +968,8 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 static int
 ra_alloc(struct ir3_ra_ctx *ctx)
 {
+	unsigned n = 0;
+
 	/* frag shader inputs get pre-assigned, since we have some
 	 * constraints/unknowns about setup for some of these regs:
 	 */
@@ -898,7 +987,8 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 			i += 4;
 		}
 
-		for (j = 0; i < ir->ninputs; i++) {
+		j = 0;
+		for (; i < ir->ninputs; i++) {
 			struct ir3_instruction *instr = ir->inputs[i];
 			if (instr) {
 				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
@@ -914,6 +1004,24 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 				}
 			}
 		}
+		n = j;
+	}
+
+	/* pre-assign array elements:
+	 * TODO we could be a bit more clever if we knew which arrays didn't
+	 * fully (partially?) conflict with each other..
+	 */
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		unsigned i;
+		for (i = 0; i < arr->length; i++) {
+			unsigned name, reg;
+
+			name = arr->base + i;
+			reg = ctx->set->gpr_to_ra_reg[0][n++];
+
+			ra_set_node_reg(ctx->g, name, reg);
+
+		}
 	}
 
 	if (!ra_allocate(ctx->g))
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 6aaa16edbfe..8f640febc5d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -187,6 +187,9 @@ delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 
 	foreach_ssa_src_n(src, i, instr) {
 		unsigned d;
+		/* for array writes, no need to delay on previous write: */
+		if (i == 0)
+			continue;
 		if (src->block != instr->block)
 			continue;
 		d = delay_calc_srcn(ctx, src, instr, i);

From ebd3a1fc17977a487b0ed86294e9b8312a1bf335 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 12 Jan 2016 10:24:10 -0500
Subject: [PATCH 034/119] ttn: use writemask for store_var

Only user is freedreno, and after array-rework it can cope.  Avoids
generating loads for a store.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c | 28 ++-----------------------
 1 file changed, 2 insertions(+), 26 deletions(-)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 9b194481277..dfda80f228f 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -673,10 +673,6 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst)
 
    if (tgsi_dst->File == TGSI_FILE_TEMPORARY) {
       if (c->temp_regs[index].var) {
-          nir_builder *b = &c->build;
-          nir_intrinsic_instr *load;
-          struct tgsi_ind_register *indirect =
-                tgsi_dst->Indirect ? &tgsi_fdst->Indirect : NULL;
           nir_register *reg;
 
          /* this works, because TGSI will give us a base offset
@@ -690,26 +686,6 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst)
          reg->num_components = 4;
          dest.dest.reg.reg = reg;
          dest.dest.reg.base_offset = 0;
-
-         /* since the alu op might not write to all components
-          * of the temporary, we must first do a load_var to
-          * get the previous array elements into the register.
-          * This is one area that NIR could use a bit of
-          * improvement (or opt pass to clean up the mess
-          * once things are scalarized)
-          */
-
-         load = nir_intrinsic_instr_create(c->build.shader,
-                                           nir_intrinsic_load_var);
-         load->num_components = 4;
-         load->variables[0] =
-               ttn_array_deref(c, load, c->temp_regs[index].var,
-                               c->temp_regs[index].offset,
-                               indirect);
-
-         load->dest = nir_dest_for_reg(reg);
-
-         nir_builder_instr_insert(b, &load->instr);
       } else {
          assert(!tgsi_dst->Indirect);
          dest.dest.reg.reg = c->temp_regs[index].reg;
@@ -1886,7 +1862,7 @@ ttn_emit_instruction(struct ttn_compile *c)
       ttn_move_dest(b, dest, nir_fsat(b, ttn_src_for_dest(b, &dest)));
    }
 
-   /* if the dst has a matching var, append store_global to move
+   /* if the dst has a matching var, append store_var to move
     * output from reg to var
     */
    nir_variable *var = ttn_get_var(c, tgsi_dst);
@@ -1899,7 +1875,7 @@ ttn_emit_instruction(struct ttn_compile *c)
                                            &tgsi_dst->Indirect : NULL;
 
       store->num_components = 4;
-      store->const_index[0] = 0xf;
+      store->const_index[0] = dest.write_mask;
       store->variables[0] = ttn_array_deref(c, store, var, offset, indirect);
       store->src[0] = nir_src_for_reg(dest.dest.reg.reg);
 

From ddede497b831fb98e3540247b570968532cdacc9 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 15 Jan 2016 17:07:02 -0500
Subject: [PATCH 035/119] freedreno/ir3: workaround bug/feature

Seems like in certain cases, we cannot use c<a0.x+0> as the third src to
cat3 instructions.  This may be slightly conservative, we may only have
this restriction when the first src is also const.

This fixes, for example, +24/-0 of the variable-indexing piglit tests.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_cp.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 0d88e7bc3ab..1cc211a7663 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -288,6 +288,15 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 					conflicts(instr->address, reg->instr->address))
 				return;
 
+			/* This seems to be a hw bug, or something where the timings
+			 * just somehow don't work out.  This restriction may only
+			 * apply if the first src is also CONST.
+			 */
+			if ((instr->category == 3) && (n == 2) &&
+					(src_reg->flags & IR3_REG_RELATIV) &&
+					(src_reg->array.offset == 0))
+				return;
+
 			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
 			src_reg->flags = new_flags;
 			instr->regs[n+1] = src_reg;

From 6a33c5c0dffce136bdc95daa2db2d3e9d3c1741f Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 15 Jan 2016 18:22:40 -0500
Subject: [PATCH 036/119] freedreno/ir3: array offset can be negative

It at least happens with some piglit tests, like
$piglit/bin/vp-address-01

  VERT
  DCL IN[0]
  DCL IN[1]
  DCL OUT[0], POSITION
  DCL OUT[1], COLOR
  DCL CONST[0..7]
  DCL ADDR[0]
    0: ARL ADDR[0].x, IN[1].xxxx
    1: MOV_SAT OUT[1], CONST[ADDR[0].x-1]
    2: DP4 OUT[0].x, CONST[4], IN[0]
    3: DP4 OUT[0].y, CONST[5], IN[0]
    4: DP4 OUT[0].z, CONST[6], IN[0]
    5: DP4 OUT[0].w, CONST[7], IN[0]
    6: END

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/instr-a3xx.h     |  1 +
 src/gallium/drivers/freedreno/ir3/ir3.c            |  2 +-
 src/gallium/drivers/freedreno/ir3/ir3.h            |  2 +-
 .../drivers/freedreno/ir3/ir3_compiler_nir.c       | 14 +++++++-------
 src/gallium/drivers/freedreno/ir3/ir3_print.c      |  6 +++---
 5 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index c3fb68d511c..1b1f1f0a797 100644
--- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -261,6 +261,7 @@ typedef union PACKED {
 	/* to make compiler happy: */
 	uint32_t dummy32;
 	uint32_t dummy10   : 10;
+	int32_t  idummy10  : 10;
 	uint32_t dummy11   : 11;
 	uint32_t dummy12   : 12;
 	uint32_t dummy13   : 13;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index be415d8e5fe..7d89142d7a1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -126,7 +126,7 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
 
 		if (reg->flags & IR3_REG_RELATIV) {
 			components = reg->size;
-			val.dummy10 = reg->array.offset;
+			val.idummy10 = reg->array.offset;
 			max = (reg->array.offset + repeat + components - 1) >> 2;
 		} else {
 			components = util_last_bit(reg->wrmask);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 1e5a1e9ee8b..f24acfa95cd 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -100,7 +100,7 @@ struct ir3_register {
 		/* relative: */
 		struct {
 			uint16_t id;
-			uint16_t offset;
+			int16_t offset;
 		} array;
 	};
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index bd0ee89a1ec..1ea2dd9cbf7 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -377,7 +377,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n)
 }
 
 static struct ir3_instruction *
-create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
+create_uniform_indirect(struct ir3_compile *ctx, int n,
 		struct ir3_instruction *address)
 {
 	struct ir3_instruction *mov;
@@ -411,7 +411,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
 }
 
 static struct ir3_instruction *
-create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
+create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
 		struct ir3_instruction *address, struct ir3_instruction *collect)
 {
 	struct ir3_block *block = ctx->block;
@@ -434,7 +434,7 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 
 /* relative (indirect) if address!=NULL */
 static struct ir3_instruction *
-create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, unsigned n,
+create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
 		struct ir3_instruction *address)
 {
 	struct ir3_block *block = ctx->block;
@@ -462,7 +462,7 @@ create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, unsigned n,
 
 /* relative (indirect) if address!=NULL */
 static struct ir3_instruction *
-create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, unsigned n,
+create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
 		struct ir3_instruction *src, struct ir3_instruction *address)
 {
 	struct ir3_block *block = ctx->block;
@@ -1000,7 +1000,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 	nir_const_value *const_offset;
 	/* UBO addresses are the first driver params: */
 	unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0);
-	unsigned off = intr->const_index[0];
+	int off = intr->const_index[0];
 
 	/* First src is ubo index, which could either be an immed or not: */
 	src0 = get_src(ctx, &intr->src[0])[0];
@@ -1141,7 +1141,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
 	struct ir3_instruction **dst, **src;
 	struct ir3_block *b = ctx->block;
-	unsigned idx = intr->const_index[0];
+	int idx = intr->const_index[0];
 	nir_const_value *const_offset;
 
 	if (info->has_dest) {
@@ -1162,7 +1162,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		} else {
 			src = get_src(ctx, &intr->src[0]);
 			for (int i = 0; i < intr->num_components; i++) {
-				unsigned n = idx * 4 + i;
+				int n = idx * 4 + i;
 				dst[i] = create_uniform_indirect(ctx, n,
 						get_addr(ctx, src[0]));
 			}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
index ec832f5d72a..ba0c4a57aa3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -107,7 +107,7 @@ static void print_reg_name(struct ir3_register *reg)
 	if (reg->flags & IR3_REG_IMMED) {
 		printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
 	} else if (reg->flags & IR3_REG_ARRAY) {
-		printf("arr[id=%u, offset=%u, size=%u", reg->array.id,
+		printf("arr[id=%u, offset=%d, size=%u", reg->array.id,
 				reg->array.offset, reg->size);
 		/* for ARRAY we could have null src, for example first write
 		 * instruction..
@@ -126,9 +126,9 @@ static void print_reg_name(struct ir3_register *reg)
 		if (reg->flags & IR3_REG_HALF)
 			printf("h");
 		if (reg->flags & IR3_REG_CONST)
-			printf("c<a0.x + %u>", reg->array.offset);
+			printf("c<a0.x + %d>", reg->array.offset);
 		else
-			printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->array.offset, reg->size);
+			printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size);
 	} else {
 		if (reg->flags & IR3_REG_HALF)
 			printf("h");

From 2a6ec1e0615127d036acfbece59576e9ef2527bc Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 15 Jan 2016 19:45:51 -0500
Subject: [PATCH 037/119] freedreno/ir3: better array register allocation

Detect arrays which don't conflict with each other and allow overlapping
register allocation.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.h    |  4 +-
 src/gallium/drivers/freedreno/ir3/ir3_ra.c | 56 ++++++++++++++++++----
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index f24acfa95cd..1a109d880e6 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -399,7 +399,9 @@ struct ir3_array {
 	struct ir3_instruction *last_write, *last_access;
 
 	/* extra stuff used in RA pass: */
-	unsigned base;
+	unsigned base;      /* base vreg name */
+	unsigned reg;       /* base physical reg */
+	uint16_t start_ip, end_ip;
 };
 
 struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 3c42f8e1592..2ed78818e61 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -272,6 +272,13 @@ struct ir3_ra_ctx {
 	struct ir3_ra_instr_data *instrd;
 };
 
+/* does it conflict? */
+static inline bool
+intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
+{
+	return !((a_start >= b_end) || (b_start >= a_end));
+}
+
 static bool
 is_half(struct ir3_instruction *instr)
 {
@@ -666,6 +673,9 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 				debug_assert(!(dst->flags & IR3_REG_PHI_SRC));
 
+				arr->start_ip = MIN2(arr->start_ip, instr->ip);
+				arr->end_ip = MAX2(arr->end_ip, instr->ip);
+
 				/* set the node class now.. in case we don't encounter
 				 * this array dst again.  From register_alloc algo's
 				 * perspective, these are all single/scalar regs:
@@ -729,6 +739,8 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 			if (reg->flags & IR3_REG_ARRAY) {
 				struct ir3_array *arr =
 					ir3_lookup_array(ctx->ir, reg->array.id);
+				arr->start_ip = MIN2(arr->start_ip, instr->ip);
+				arr->end_ip = MAX2(arr->end_ip, instr->ip);
 				/* indirect read is treated like a read fromall array
 				 * elements, since we don't know which one is actually
 				 * read:
@@ -802,6 +814,12 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 {
 	struct ir3 *ir = ctx->ir;
 
+	/* initialize array live ranges: */
+	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+		arr->start_ip = ~0;
+		arr->end_ip = 0;
+	}
+
 	/* compute live ranges (use/def) on a block level, also updating
 	 * block's def/use bitmasks (used below to calculate per-block
 	 * livein/liveout):
@@ -840,8 +858,8 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 
 	for (unsigned i = 0; i < ctx->alloc_count; i++) {
 		for (unsigned j = 0; j < ctx->alloc_count; j++) {
-			if (!((ctx->def[i] >= ctx->use[j]) ||
-					(ctx->def[j] >= ctx->use[i]))) {
+			if (intersects(ctx->def[i], ctx->use[i],
+					ctx->def[j], ctx->use[j])) {
 				ra_add_node_interference(ctx->g, i, j);
 			}
 		}
@@ -1008,19 +1026,41 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 	}
 
 	/* pre-assign array elements:
-	 * TODO we could be a bit more clever if we knew which arrays didn't
-	 * fully (partially?) conflict with each other..
 	 */
 	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-		unsigned i;
-		for (i = 0; i < arr->length; i++) {
+		unsigned base = n;
+
+		if (arr->end_ip == 0)
+			continue;
+
+		/* figure out what else we conflict with which has already
+		 * been assigned:
+		 */
+retry:
+		list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
+			if (arr2 == arr)
+				break;
+			if (arr2->end_ip == 0)
+				continue;
+			/* if it intersects with liverange AND register range.. */
+			if (intersects(arr->start_ip, arr->end_ip,
+					arr2->start_ip, arr2->end_ip) &&
+				intersects(base, base + arr->length,
+					arr2->reg, arr2->reg + arr2->length)) {
+				base = MAX2(base, arr2->reg + arr2->length);
+				goto retry;
+			}
+		}
+
+		arr->reg = base;
+
+		for (unsigned i = 0; i < arr->length; i++) {
 			unsigned name, reg;
 
 			name = arr->base + i;
-			reg = ctx->set->gpr_to_ra_reg[0][n++];
+			reg = ctx->set->gpr_to_ra_reg[0][base++];
 
 			ra_set_node_reg(ctx->g, name, reg);
-
 		}
 	}
 

From 02ac91d717036be0c8390b99860d37ff390c50e2 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 17 Jan 2016 12:21:45 -0500
Subject: [PATCH 038/119] freedreno/ir3: fix mad 3rd src delay calc

In fad158a0 ("freedreno/ir3: array rework") the src # (n) shifted by
one, but missed updating delay-slot calc.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_depth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index 3354cbd23fa..6d294f1a48c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -76,7 +76,7 @@ int ir3_delayslots(struct ir3_instruction *assigner,
 		return 6;
 	} else if ((consumer->category == 3) &&
 			(is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
-			(n == 2)) {
+			(n == 3)) {
 		/* special case, 3rd src to cat3 not required on first cycle */
 		return 1;
 	} else {

From 529aa8249a29577f160e1f289472b0da7241b90f Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sun, 17 Jan 2016 14:25:32 +0200
Subject: [PATCH 039/119] llvmpipe: fix arguments order given to vec_andc

This patch fixes a classic "confuse the enemy" bug.

_mm_andnot_si128 (SSE) and vec_andc (VMX) do the same operation, but the
arguments are opposite.

_mm_andnot_si128 performs "r = (~a) & b" while
vec_andc performs "r = a & (~b)"

To make sure this error won't return in another place, I added a wrapper
function, vec_andnot_si128, in u_pwr8.h, which makes the swap inside.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/util/u_pwr8.h         | 6 ++++++
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h
index 1eca6d6df2c..ffd9f923142 100644
--- a/src/gallium/auxiliary/util/u_pwr8.h
+++ b/src/gallium/auxiliary/util/u_pwr8.h
@@ -153,6 +153,12 @@ vec_mullo_epi32 (__m128i a, __m128i b)
    return v;
 }
 
+static inline __m128i
+vec_andnot_si128 (__m128i a, __m128i b)
+{
+   return vec_andc (b, a);
+}
+
 static inline void
 transpose4_epi32(const __m128i * restrict a,
                  const __m128i * restrict b,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index aa241761586..907129dbd1b 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -556,7 +556,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
       /* Calculate trivial reject values:
        */
-      eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
+      eo = vec_sub_epi32(vec_andnot_si128(dcdy_neg_mask, dcdy),
                          vec_and(dcdx_neg_mask, dcdx));
 
       /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */

From 5a81b48ad0a895d4a609bb9eddd55a9456cb9091 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 17 Jan 2016 03:41:55 -0500
Subject: [PATCH 040/119] nvc0: bsp_bo can't be null

We already deref it earlier. And these are all allocated on load.
Spotted by Coverity.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
index c53f946a762..af072a8acdc 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -64,7 +64,7 @@ nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
       bsp_size += num_bytes[i];
    bsp_size += 256; /* the 4 end markers */
 
-   if (!bsp_bo || bsp_size > bsp_bo->size) {
+   if (bsp_size > bsp_bo->size) {
       union nouveau_bo_config cfg;
       struct nouveau_bo *tmp_bo = NULL;
 

From 3281ae96c8e38ecb0d92155ff46142d3cb4d57b7 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 17 Jan 2016 03:44:22 -0500
Subject: [PATCH 041/119] tgsi: initialize Atomic field in
 tgsi_default_declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Spotted by Coverity.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_build.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index ea207461d27..83f50628b40 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -110,6 +110,7 @@ tgsi_default_declaration( void )
    declaration.Invariant = 0;
    declaration.Local = 0;
    declaration.Array = 0;
+   declaration.Atomic = 0;
    declaration.Padding = 0;
 
    return declaration;

From 4ac1274caa260300e951d3e66596d308ec531cfa Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 17 Jan 2016 16:24:02 -0500
Subject: [PATCH 042/119] gm107/ir: don't do indirect frag shader inputs on
 GM107

Apparently the IPA op decided to stop working with offsets. Need to
figure out if we need to do an AL2P situation or something similar. For
now just turn it back off.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 6f7defa35f2..7211df94bb8 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -298,6 +298,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
       return shader != PIPE_SHADER_FRAGMENT;
    case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+      return shader != PIPE_SHADER_FRAGMENT || class_3d < GM107_3D_CLASS;
    case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
    case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
       return 1;

From 50376e0c0e81f80cac6ec77da491241e4ccf57f0 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 18 Jan 2016 11:13:27 +1100
Subject: [PATCH 043/119] glsl: fix segfault linking subroutine uniform with
 explicit location

Reviewed-by: Dave Airlie <airlied@redhat.com>
Cc: "11.0 11.1" mesa-stable@lists.freedesktop.org
---
 src/glsl/linker.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 3f40c0be332..6657777d74c 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3162,7 +3162,7 @@ check_explicit_uniform_locations(struct gl_context *ctx,
 
          if (var->data.explicit_location) {
             bool ret;
-            if (var->type->is_subroutine())
+            if (var->type->without_array()->is_subroutine())
                ret = reserve_subroutine_explicit_locations(prog, sh, var);
             else
                ret = reserve_explicit_locations(prog, uniform_map, var);

From 86677f101641c75d52577e3cd9e76441b1228b21 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Sun, 17 Jan 2016 16:09:08 +1100
Subject: [PATCH 044/119] mesa: fix segfault in glUniformSubroutinesuiv()

From Section 7.9 (SUBROUTINE UNIFORM VARIABLES) of the OpenGL
4.5 Core spec:

   "The command

       void UniformSubroutinesuiv(enum shadertype, sizei count,
                                  const uint *indices);

   will load all active subroutine uniforms for shader stage
   shadertype with subroutine indices from indices, storing
   indices[i] into the uniform at location i. The indices for
   any locations between zero and the value of
   ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS minus one which are not
   used will be ignored."

V2: simplify NULL check suggested by Jason.

Acked-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Cc: "11.0 11.1" mesa-stable@lists.freedesktop.org
https://bugs.freedesktop.org/show_bug.cgi?id=93731
---
 src/mesa/main/shaderapi.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index cdc85f3413b..126786cfaf4 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2530,6 +2530,11 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
    i = 0;
    do {
       struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i];
+      if (uni == NULL) {
+         i++;
+         continue;
+      }
+
       int uni_count = uni->array_elements ? uni->array_elements : 1;
       int j, k;
 
@@ -2557,6 +2562,11 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
    i = 0;
    do {
       struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i];
+      if (uni == NULL) {
+         i++;
+         continue;
+      }
+
       int uni_count = uni->array_elements ? uni->array_elements : 1;
 
       memcpy(&uni->storage[0], &indices[i],

From eac2cece3143e07f0f229df15ab757b4f265b9a3 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 15 Jan 2016 13:45:48 +1100
Subject: [PATCH 045/119] glsl: add missing explicit_stream flag to
 has_layout()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This will allow the ARB_shading_language_420pack rules in
glsl_parser.yy for catching duplicate layout qualifiers to be
triggered for the stream identifier rather than relying on the
code meant to catch duplicates within a single layout(...)

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/ast_type.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index 8643b7bfb76..19ffe833164 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -76,7 +76,8 @@ ast_type_qualifier::has_layout() const
           || this->flags.q.explicit_location
           || this->flags.q.explicit_index
           || this->flags.q.explicit_binding
-          || this->flags.q.explicit_offset;
+          || this->flags.q.explicit_offset
+          || this->flags.q.explicit_stream;
 }
 
 bool

From 9258d9f23daca39767a41abc47836991a5f189b4 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 15 Jan 2016 13:45:49 +1100
Subject: [PATCH 046/119] glsl: remove special case for detecting stream
 duplicates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Any duplicates in a single declaration will already fail the
generic duplicates test due to the explicit_stream flag being set.

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/glsl/ast_type.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index 19ffe833164..1f675b28ae8 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -208,11 +208,6 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
             this->flags.q.stream = 1;
             this->stream = state->out_qualifier->stream;
          }
-      } else {
-         if (q.flags.q.explicit_stream) {
-            _mesa_glsl_error(loc, state,
-                             "duplicate layout `stream' qualifier");
-         }
       }
    }
 

From 119bef954379ebb35faf182b0b665becacddab76 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sun, 17 Jan 2016 14:23:35 +1000
Subject: [PATCH 047/119] glsl: fix subroutine lowering reusing actual
 parmaters

One of the oglconform tests was crashing here, and it was
due to not cloning the actual parameters before creating the
new call. This makes a call clone function that does the right
things to make sure we clone all the needed info, and points
the callee at it. (It differs from ->clone due to this).

this may fix https://bugs.freedesktop.org/show_bug.cgi?id=93722, I had this
patch in my cts fixes tree, but hadn't had time to make sure I liked it.

Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
---
 src/glsl/lower_subroutine.cpp | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/glsl/lower_subroutine.cpp b/src/glsl/lower_subroutine.cpp
index a0df5e1df81..ac8ade13d99 100644
--- a/src/glsl/lower_subroutine.cpp
+++ b/src/glsl/lower_subroutine.cpp
@@ -44,6 +44,7 @@ public:
    }
 
    ir_visitor_status visit_leave(ir_call *);
+   ir_call *call_clone(ir_call *call, ir_function_signature *callee);
    bool progress;
    struct _mesa_glsl_parse_state *state;
 };
@@ -58,6 +59,23 @@ lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state)
    return v.progress;
 }
 
+ir_call *
+lower_subroutine_visitor::call_clone(ir_call *call, ir_function_signature *callee)
+{
+   void *mem_ctx = ralloc_parent(call);
+   ir_dereference_variable *new_return_ref = NULL;
+   if (call->return_deref != NULL)
+      new_return_ref = call->return_deref->clone(mem_ctx, NULL);
+
+   exec_list new_parameters;
+
+   foreach_in_list(ir_instruction, ir, &call->actual_parameters) {
+      new_parameters.push_tail(ir->clone(mem_ctx, NULL));
+   }
+
+   return new(mem_ctx) ir_call(callee, new_return_ref, &new_parameters);
+}
+
 ir_visitor_status
 lower_subroutine_visitor::visit_leave(ir_call *ir)
 {
@@ -66,7 +84,6 @@ lower_subroutine_visitor::visit_leave(ir_call *ir)
 
    void *mem_ctx = ralloc_parent(ir);
    ir_if *last_branch = NULL;
-   ir_dereference_variable *return_deref = ir->return_deref;
 
    for (int s = this->state->num_subroutines - 1; s >= 0; s--) {
       ir_rvalue *var;
@@ -92,14 +109,11 @@ lower_subroutine_visitor::visit_leave(ir_call *ir)
          fn->exact_matching_signature(this->state,
                                       &ir->actual_parameters);
 
-      ir_call *new_call = new(mem_ctx) ir_call(sub_sig, return_deref, &ir->actual_parameters);
+      ir_call *new_call = call_clone(ir, sub_sig);
       if (!last_branch)
          last_branch = if_tree(equal(subr_to_int(var), lc), new_call);
       else
          last_branch = if_tree(equal(subr_to_int(var), lc), new_call, last_branch);
-
-      if (return_deref && s > 0)
-        return_deref = return_deref->clone(mem_ctx, NULL);
    }
    if (last_branch)
       ir->insert_before(last_branch);

From 679a654a77f392cc9f8e2b51222b277bd1783ae5 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sun, 17 Jan 2016 22:15:40 +0200
Subject: [PATCH 048/119] llvmpipe: use vpkswss when dst is signed

This patch fixes a bug when building a pack instruction.

For POWER (altivec), in case the destination is signed and the
src width is 32, we need to use vpkswss. The original code used vpkuwus,
which emits an unsigned result.

This fixes the following piglit tests on ppc64le:
- spec@arb_color_buffer_float@gl_rgba8-drawpixels
- shaders@glsl-fs-fogscale

I've also corrected some coding style issues in the function.

v2: Returned else statements to vmware style

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_pack.c | 31 ++++++++++-----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index cdf6d80c261..0b0f7f0147c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -461,50 +461,49 @@ lp_build_pack2(struct gallivm_state *gallivm,
    assert(src_type.length * 2 == dst_type.length);
 
    /* Check for special cases first */
-   if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
-       src_type.width * src_type.length >= 128) {
+   if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
+        src_type.width * src_type.length >= 128) {
       const char *intrinsic = NULL;
       boolean swap_intrinsic_operands = FALSE;
 
       switch(src_type.width) {
       case 32:
          if (util_cpu_caps.has_sse2) {
-           if(dst_type.sign) {
+           if (dst_type.sign) {
               intrinsic = "llvm.x86.sse2.packssdw.128";
-           }
-           else {
+           } else {
               if (util_cpu_caps.has_sse4_1) {
                  intrinsic = "llvm.x86.sse41.packusdw";
               }
            }
          } else if (util_cpu_caps.has_altivec) {
             if (dst_type.sign) {
-              intrinsic = "llvm.ppc.altivec.vpkswus";
-           } else {
-              intrinsic = "llvm.ppc.altivec.vpkuwus";
-           }
+               intrinsic = "llvm.ppc.altivec.vpkswss";
+            } else {
+               intrinsic = "llvm.ppc.altivec.vpkuwus";
+            }
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
-           swap_intrinsic_operands = TRUE;
+            swap_intrinsic_operands = TRUE;
 #endif
          }
          break;
       case 16:
          if (dst_type.sign) {
             if (util_cpu_caps.has_sse2) {
-              intrinsic = "llvm.x86.sse2.packsswb.128";
+               intrinsic = "llvm.x86.sse2.packsswb.128";
             } else if (util_cpu_caps.has_altivec) {
-              intrinsic = "llvm.ppc.altivec.vpkshss";
+               intrinsic = "llvm.ppc.altivec.vpkshss";
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
-              swap_intrinsic_operands = TRUE;
+               swap_intrinsic_operands = TRUE;
 #endif
             }
          } else {
             if (util_cpu_caps.has_sse2) {
-              intrinsic = "llvm.x86.sse2.packuswb.128";
+               intrinsic = "llvm.x86.sse2.packuswb.128";
             } else if (util_cpu_caps.has_altivec) {
-	      intrinsic = "llvm.ppc.altivec.vpkshus";
+               intrinsic = "llvm.ppc.altivec.vpkshus";
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
-              swap_intrinsic_operands = TRUE;
+               swap_intrinsic_operands = TRUE;
 #endif
             }
          }

From 4297259fc87b513f307ba9af3289b98b54f2cbb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Fri, 15 Jan 2016 12:13:15 +0900
Subject: [PATCH 049/119] radeonsi: Print "LLVM emitted unknown config
 register" warning only once
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Say "LLVM" instead of "Compiler" for clarity.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index cc9718e42d3..3ab054c3253 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3735,8 +3735,15 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 				G_00B860_WAVESIZE(value) * 256 * 4 * 1;
 			break;
 		default:
-			fprintf(stderr, "Warning: Compiler emitted unknown "
-				"config register: 0x%x\n", reg);
+			{
+				static bool printed;
+
+				if (!printed) {
+					fprintf(stderr, "Warning: LLVM emitted unknown "
+						"config register: 0x%x\n", reg);
+					printed = true;
+				}
+			}
 			break;
 		}
 	}

From a9ab7172a621294f5d7b5dfac3363bed240e20a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Fri, 15 Jan 2016 12:18:29 +0900
Subject: [PATCH 050/119] radeonsi: Avoid warning about LLVM generating
 R_0286D0_SPI_PS_INPUT_ADDR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 3ab054c3253..2de7def8dd2 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3728,6 +3728,9 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			conf->spi_ps_input_ena = value;
 			break;
+		case R_0286D0_SPI_PS_INPUT_ADDR:
+			/* Not used yet, but will be in the future */
+			break;
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */

From 0491dd1deb46ca77545322c53e90f86ae244cdae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Fri, 15 Jan 2016 15:46:31 +0900
Subject: [PATCH 051/119] st/dri: Don't call invalidate_resource for NULL
 depth/stencil buffers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes crash in 4 EGL piglit tests with radeonsi.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/state_trackers/dri/dri_drawable.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/state_trackers/dri/dri_drawable.c b/src/gallium/state_trackers/dri/dri_drawable.c
index f0cc4a2a3ef..adc51284767 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/dri_drawable.c
@@ -492,8 +492,10 @@ dri_flush(__DRIcontext *cPriv,
 
       if (pipe->invalidate_resource &&
           (flags & __DRI2_FLUSH_INVALIDATE_ANCILLARY)) {
-         pipe->invalidate_resource(pipe, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]);
-         pipe->invalidate_resource(pipe, drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]);
+         if (drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL])
+            pipe->invalidate_resource(pipe, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]);
+         if (drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL])
+            pipe->invalidate_resource(pipe, drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]);
       }
    }
 

From ad20be1f30ef5d11bcacef38a921cb778c504dd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Fri, 15 Jan 2016 16:02:22 +0900
Subject: [PATCH 052/119] gallium/radeon: Rename do_invalidate_resource to
 invalidate_buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

And only call it from r600_invalidate_resource for buffer resources.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeon/r600_buffer_common.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 6592c5bdeca..c7984c47304 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -210,8 +210,8 @@ static void r600_buffer_destroy(struct pipe_screen *screen,
 }
 
 static bool
-r600_do_invalidate_resource(struct r600_common_context *rctx,
-			    struct r600_resource *rbuffer)
+r600_invalidate_buffer(struct r600_common_context *rctx,
+		       struct r600_resource *rbuffer)
 {
 	/* In AMD_pinned_memory, the user pointer association only gets
 	 * broken when the buffer is explicitly re-allocated.
@@ -236,7 +236,9 @@ void r600_invalidate_resource(struct pipe_context *ctx,
 	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
 	struct r600_resource *rbuffer = r600_resource(resource);
 
-	(void)r600_do_invalidate_resource(rctx, rbuffer);
+	/* We currently only do anyting here for buffers */
+	if (resource->target == PIPE_BUFFER)
+		(void)r600_invalidate_buffer(rctx, rbuffer);
 }
 
 static void *r600_buffer_get_transfer(struct pipe_context *ctx,
@@ -306,7 +308,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
 		assert(usage & PIPE_TRANSFER_WRITE);
 
-		if (r600_do_invalidate_resource(rctx, rbuffer)) {
+		if (r600_invalidate_buffer(rctx, rbuffer)) {
 			/* At this point, the buffer is always idle. */
 			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 		}

From eaf7ec9cfc5165f461bddc365aaaf6cb25c2d9bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Sun, 13 Dec 2015 11:44:13 +0100
Subject: [PATCH 053/119] st/va: add motion adaptive deinterlacing v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: minor cleanup

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/state_trackers/va/context.c    |  5 ++
 src/gallium/state_trackers/va/postproc.c   | 60 ++++++++++++++++++++--
 src/gallium/state_trackers/va/surface.c    | 22 ++++++--
 src/gallium/state_trackers/va/va_private.h |  2 +
 4 files changed, 82 insertions(+), 7 deletions(-)

diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c
index 37a011799e2..b25c381d968 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -31,6 +31,7 @@
 #include "util/u_memory.h"
 #include "util/u_handle_table.h"
 #include "util/u_video.h"
+#include "vl/vl_deint_filter.h"
 #include "vl/vl_winsys.h"
 
 #include "va_private.h"
@@ -296,6 +297,10 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID context_id)
       }
       context->decoder->destroy(context->decoder);
    }
+   if (context->deint) {
+      vl_deint_filter_cleanup(context->deint);
+      FREE(context->deint);
+   }
    FREE(context);
    handle_table_remove(drv->htab, context_id);
    pipe_mutex_unlock(drv->mutex);
diff --git a/src/gallium/state_trackers/va/postproc.c b/src/gallium/state_trackers/va/postproc.c
index 0cec0c88124..f541f7c8aeb 100644
--- a/src/gallium/state_trackers/va/postproc.c
+++ b/src/gallium/state_trackers/va/postproc.c
@@ -29,6 +29,7 @@
 
 #include "vl/vl_defines.h"
 #include "vl/vl_video_buffer.h"
+#include "vl/vl_deint_filter.h"
 
 #include "va_private.h"
 
@@ -174,6 +175,51 @@ static VAStatus vlVaPostProcBlit(vlVaDriver *drv, vlVaContext *context,
    return VA_STATUS_SUCCESS;
 }
 
+static struct pipe_video_buffer *
+vlVaApplyDeint(vlVaDriver *drv, vlVaContext *context,
+               VAProcPipelineParameterBuffer *param,
+               struct pipe_video_buffer *current,
+               unsigned field)
+{
+   vlVaSurface *prevprev, *prev, *next;
+
+   if (param->num_forward_references < 1 ||
+       param->num_backward_references < 2)
+      return current;
+
+   prevprev = handle_table_get(drv->htab, param->backward_references[1]);
+   prev = handle_table_get(drv->htab, param->backward_references[0]);
+   next = handle_table_get(drv->htab, param->forward_references[0]);
+
+   if (!prevprev || !prev || !next)
+      return current;
+
+   if (context->deint && (context->deint->video_width != current->width ||
+       context->deint->video_height != current->height)) {
+      vl_deint_filter_cleanup(context->deint);
+      FREE(context->deint);
+      context->deint = NULL;
+   }
+
+   if (!context->deint) {
+      context->deint = MALLOC(sizeof(struct vl_deint_filter));
+      if (!vl_deint_filter_init(context->deint, drv->pipe, current->width,
+                                current->height, false, false)) {
+         FREE(context->deint);
+         context->deint = NULL;
+         return current;
+      }
+   }
+
+   if (!vl_deint_filter_check_buffers(context->deint, prevprev->buffer,
+                                      prev->buffer, current, next->buffer))
+      return current;
+
+   vl_deint_filter_render(context->deint, prevprev->buffer, prev->buffer,
+                          current, next->buffer, field);
+   return context->deint->video_buffer;
+}
+
 VAStatus
 vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
 {
@@ -181,6 +227,7 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
    VARectangle def_src_region, def_dst_region;
    const VARectangle *src_region, *dst_region;
    VAProcPipelineParameterBuffer *param;
+   struct pipe_video_buffer *src;
    vlVaSurface *src_surface;
    unsigned i;
 
@@ -199,6 +246,8 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
    if (!src_surface || !src_surface->buffer)
       return VA_STATUS_ERROR_INVALID_SURFACE;
 
+   src = src_surface->buffer;
+
    for (i = 0; i < param->num_filters; i++) {
       vlVaBuffer *buf = handle_table_get(drv->htab, param->filters[i]);
       VAProcFilterParameterBufferBase *filter;
@@ -222,6 +271,11 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
             deinterlace = VL_COMPOSITOR_WEAVE;
             break;
 
+         case VAProcDeinterlacingMotionAdaptive:
+            src = vlVaApplyDeint(drv, context, param, src,
+				 deint->flags & VA_DEINTERLACING_BOTTOM_FIELD);
+            break;
+
          default:
             return VA_STATUS_ERROR_UNIMPLEMENTED;
          }
@@ -239,10 +293,8 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
 
    if (context->target->buffer_format != PIPE_FORMAT_NV12)
       return vlVaPostProcCompositor(drv, context, src_region, dst_region,
-                                    src_surface->buffer, context->target,
-                                    deinterlace);
+                                    src, context->target, deinterlace);
    else
       return vlVaPostProcBlit(drv, context, src_region, dst_region,
-                              src_surface->buffer, context->target,
-                              deinterlace);
+                              src, context->target, deinterlace);
 }
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index f23a88901f5..84a94949c47 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -691,13 +691,14 @@ vlVaQueryVideoProcFilterCaps(VADriverContextP ctx, VAContextID context,
    case VAProcFilterDeinterlacing: {
       VAProcFilterCapDeinterlacing *deint = filter_caps;
 
-      if (*num_filter_caps < 2) {
-         *num_filter_caps = 2;
+      if (*num_filter_caps < 3) {
+         *num_filter_caps = 3;
          return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
       }
 
       deint[i++].type = VAProcDeinterlacingBob;
       deint[i++].type = VAProcDeinterlacingWeave;
+      deint[i++].type = VAProcDeinterlacingMotionAdaptive;
       break;
    }
 
@@ -750,9 +751,24 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context,
 
    for (i = 0; i < num_filters; i++) {
       vlVaBuffer *buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, filters[i]);
+      VAProcFilterParameterBufferBase *filter;
 
-      if (!buf || buf->type >= VABufferTypeMax)
+      if (!buf || buf->type != VAProcFilterParameterBufferType)
          return VA_STATUS_ERROR_INVALID_BUFFER;
+
+      filter = buf->data;
+      switch (filter->type) {
+      case VAProcFilterDeinterlacing: {
+         VAProcFilterParameterBufferDeinterlacing *deint = buf->data;
+         if (deint->algorithm == VAProcDeinterlacingMotionAdaptive) {
+            pipeline_cap->num_forward_references = 1;
+            pipeline_cap->num_backward_references = 2;
+         }
+         break;
+      }
+      default:
+         return VA_STATUS_ERROR_UNIMPLEMENTED;
+      }
    }
 
    return VA_STATUS_SUCCESS;
diff --git a/src/gallium/state_trackers/va/va_private.h b/src/gallium/state_trackers/va/va_private.h
index 7afd81a196d..614fa98fef7 100644
--- a/src/gallium/state_trackers/va/va_private.h
+++ b/src/gallium/state_trackers/va/va_private.h
@@ -236,6 +236,8 @@ typedef struct {
       VAPictureParameterBufferMPEG4 pps;
       uint8_t start_code[32];
    } mpeg4;
+
+   struct vl_deint_filter *deint;
 } vlVaContext;
 
 typedef struct {

From a78e08e88fb42db540955f94752a49a4800c0e43 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.velikov@collabora.com>
Date: Thu, 14 Jan 2016 09:28:21 +0200
Subject: [PATCH 054/119] i965: adding missing headers to the dist tarball

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/mesa/drivers/dri/i965/Makefile.sources | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 5aeeca57f37..cea1e87ccde 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -71,7 +71,9 @@ i965_compiler_FILES = \
 	brw_vec4_surface_builder.cpp \
 	brw_vec4_surface_builder.h \
 	brw_vec4_tcs.cpp \
+	brw_vec4_tcs.h \
 	brw_vec4_tes.cpp \
+	brw_vec4_tes.h \
 	brw_vec4_visitor.cpp \
 	brw_vec4_vs_visitor.cpp \
 	brw_vue_map.c \

From 7bc714509b5661658ebe65422e49cbf2055f6ad3 Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.velikov@collabora.com>
Date: Mon, 18 Jan 2016 13:06:28 +0200
Subject: [PATCH 055/119] mapi: include gl.xml in the tarball

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/mapi/Makefile.am | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mapi/Makefile.am b/src/mapi/Makefile.am
index 307e05d503f..5ba924192cf 100644
--- a/src/mapi/Makefile.am
+++ b/src/mapi/Makefile.am
@@ -35,6 +35,7 @@ EXTRA_DIST = \
 	es2api/ABI-check \
 	mapi_abi.py \
 	glapi/SConscript \
+	glapi/registry/gl.xml \
 	shared-glapi/SConscript
 
 AM_CFLAGS = \

From c03f3dd0a518a53598dcfd1ac19a453bfebcb71a Mon Sep 17 00:00:00 2001
From: Emil Velikov <emil.velikov@collabora.com>
Date: Mon, 18 Jan 2016 13:34:14 +0200
Subject: [PATCH 056/119] gallium: bundle the compat header u_pwr8.h in the
 tarball

Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/gallium/auxiliary/Makefile.sources | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 5325f974cda..6f50f714c3f 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -271,6 +271,7 @@ C_SOURCES := \
 	util/u_prim_restart.h \
 	util/u_pstipple.c \
 	util/u_pstipple.h \
+	util/u_pwr8.h \
 	util/u_range.h \
 	util/u_rect.h \
 	util/u_resource.c \

From 6062941e4dd441bb67d5db760b87ffc0509befb7 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 18 Jan 2016 15:22:27 -0500
Subject: [PATCH 057/119] freedreno: per-generation OUT_IB packet

Some a4xx firmware doesn't implement the "PFD" (prefetch-disabled)
version of the CP_INDIRECT_BUFFER packet.  So allow for PFD vs PFE per
generation.  Switch a3xx and a4xx over to using prefetch-enabled version
(which is also what blob does.. it seems only on a2xx we cannot use
PFE).

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a2xx/fd2_context.c  |  1 +
 src/gallium/drivers/freedreno/a2xx/fd2_emit.c     | 14 ++++++++++++++
 src/gallium/drivers/freedreno/a2xx/fd2_emit.h     |  2 ++
 src/gallium/drivers/freedreno/a3xx/fd3_emit.c     |  8 ++++++++
 src/gallium/drivers/freedreno/a3xx/fd3_gmem.c     |  2 +-
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c     |  8 ++++++++
 src/gallium/drivers/freedreno/freedreno_context.h |  4 ++++
 src/gallium/drivers/freedreno/freedreno_gmem.c    |  4 ++--
 src/gallium/drivers/freedreno/freedreno_util.h    |  6 +++---
 9 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
index 3bed73573a6..058f8219ed5 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_context.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
@@ -109,6 +109,7 @@ fd2_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
 	fd2_gmem_init(pctx);
 	fd2_texture_init(pctx);
 	fd2_prog_init(pctx);
+	fd2_emit_init(pctx);
 
 	pctx = fd_context_init(&fd2_ctx->base, pscreen,
 			(screen->gpu_id >= 220) ? a22x_primtypes : a20x_primtypes,
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
index cc0ed59f300..4f667ab7d57 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
@@ -446,3 +446,17 @@ fd2_emit_setup(struct fd_context *ctx)
 	fd_ringbuffer_flush(ring);
 	fd_ringmarker_mark(ctx->draw_start);
 }
+
+static void
+fd2_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
+		struct fd_ringmarker *end)
+{
+	__OUT_IB(ring, false, start, end);
+}
+
+void
+fd2_emit_init(struct pipe_context *pctx)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->emit_ib = fd2_emit_ib;
+}
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
index 8ee04632091..3c146c17151 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
@@ -45,4 +45,6 @@ void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val,
 void fd2_emit_state(struct fd_context *ctx, uint32_t dirty);
 void fd2_emit_setup(struct fd_context *ctx);
 
+void fd2_emit_init(struct pipe_context *pctx);
+
 #endif /* FD2_EMIT_H */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index e65a352e7f6..811f58bbba2 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -891,10 +891,18 @@ fd3_emit_restore(struct fd_context *ctx)
 	ctx->needs_rb_fbd = true;
 }
 
+static void
+fd3_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
+		struct fd_ringmarker *end)
+{
+	__OUT_IB(ring, true, start, end);
+}
+
 void
 fd3_emit_init(struct pipe_context *pctx)
 {
 	struct fd_context *ctx = fd_context(pctx);
 	ctx->emit_const = fd3_emit_const;
 	ctx->emit_const_bo = fd3_emit_const_bo;
+	ctx->emit_ib = fd3_emit_ib;
 }
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index 21fb59e450d..2ce393a41ae 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -853,7 +853,7 @@ emit_binning_pass(struct fd_context *ctx)
 			A3XX_PC_VSTREAM_CONTROL_N(0));
 
 	/* emit IB to binning drawcmds: */
-	OUT_IB(ring, ctx->binning_start, ctx->binning_end);
+	ctx->emit_ib(ring, ctx->binning_start, ctx->binning_end);
 	fd_reset_wfi(ctx);
 
 	fd_wfi(ctx, ring);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index bc62a5d9a4b..4a3f1da30ed 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -885,10 +885,18 @@ fd4_emit_restore(struct fd_context *ctx)
 	ctx->needs_rb_fbd = true;
 }
 
+static void
+fd4_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
+		struct fd_ringmarker *end)
+{
+	__OUT_IB(ring, true, start, end);
+}
+
 void
 fd4_emit_init(struct pipe_context *pctx)
 {
 	struct fd_context *ctx = fd_context(pctx);
 	ctx->emit_const = fd4_emit_const;
 	ctx->emit_const_bo = fd4_emit_const_bo;
+	ctx->emit_ib = fd4_emit_ib;
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 418b71b95de..9e7130ab915 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -386,6 +386,10 @@ struct fd_context {
 			const uint32_t *dwords, struct pipe_resource *prsc);
 	void (*emit_const_bo)(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
 			uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets);
+
+	/* indirect-branch emit: */
+	void (*emit_ib)(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
+			struct fd_ringmarker *end);
 };
 
 static inline struct fd_context *
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index 648db9baee5..0d73349057c 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -331,7 +331,7 @@ render_tiles(struct fd_context *ctx)
 		fd_hw_query_prepare_tile(ctx, i, ctx->ring);
 
 		/* emit IB to drawcmds: */
-		OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
+		ctx->emit_ib(ctx->ring, ctx->draw_start, ctx->draw_end);
 		fd_reset_wfi(ctx);
 
 		/* emit gmem2mem to transfer tile back to system memory: */
@@ -349,7 +349,7 @@ render_sysmem(struct fd_context *ctx)
 	fd_hw_query_prepare_tile(ctx, 0, ctx->ring);
 
 	/* emit IB to drawcmds: */
-	OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
+	ctx->emit_ib(ctx->ring, ctx->draw_start, ctx->draw_end);
 	fd_reset_wfi(ctx);
 }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 0d2418e1e00..47dd467f498 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -265,8 +265,8 @@ OUT_WFI(struct fd_ringbuffer *ring)
 }
 
 static inline void
-OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
-		struct fd_ringmarker *end)
+__OUT_IB(struct fd_ringbuffer *ring, bool prefetch,
+		struct fd_ringmarker *start, struct fd_ringmarker *end)
 {
 	uint32_t dwords = fd_ringmarker_dwords(start, end);
 
@@ -280,7 +280,7 @@ OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
 	 */
 	emit_marker(ring, 6);
 
-	OUT_PKT3(ring, CP_INDIRECT_BUFFER_PFD, 2);
+	OUT_PKT3(ring, prefetch ? CP_INDIRECT_BUFFER_PFE : CP_INDIRECT_BUFFER_PFD, 2);
 	fd_ringbuffer_emit_reloc_ring(ring, start, end);
 	OUT_RING(ring, dwords);
 

From 805e080ba0ef637cfc7b637538b5a0ce5f0b53dc Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 18 Jan 2016 15:30:53 -0500
Subject: [PATCH 058/119] freedreno/a4xx: use smaller threadsize for more
 registers

Once we go past half of the "GPR" register file, it seems like we need
to run frag shader with smaller threadsize.  (The vertex shader already
runs at TWO_QUADS, which is the minimum.)

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/a4xx/fd4_program.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index 32b8fce1613..74716fb733f 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -217,6 +217,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 	struct stage s[MAX_STAGES];
 	uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
 	uint32_t face_regid, coord_regid, zwcoord_regid;
+	enum a3xx_threadsize fssz;
 	int constmode;
 	int i, j, k;
 
@@ -224,6 +225,8 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 
 	setup_stages(emit, s);
 
+	fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS;
+
 	/* blob seems to always use constmode currently: */
 	constmode = 1;
 
@@ -258,7 +261,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 	OUT_RING(ring, 0x00000003);
 
 	OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5);
-	OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
+	OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) |
 			A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
 			A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
 			/* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
@@ -385,7 +388,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 			A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
 			A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
 			A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
-			A4XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
+			A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
 			A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
 			COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE));
 	OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) |

From af686e7de3cc7cc053959729e9e31432740489fa Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 17 Jan 2016 16:25:00 -0500
Subject: [PATCH 059/119] st/mesa: restore the stObj's size if it was cleared
 out
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An issue could still occur if the base level is set, but fixing that
would require a lot more logic.

This fixes the recently-failing texelFetch 3D tests because the mipmaps
were no longer being generated, which in turn caused the copying logic
to be hit, which in turn didn't work because of the broken
width/height/depth.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/mesa/state_tracker/st_cb_texture.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index f8b367989e7..0ceb37027e1 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -1670,6 +1670,12 @@ st_finalize_texture(struct gl_context *ctx,
          width = stObj->width0;
          height = stObj->height0;
          depth = stObj->depth0;
+      } else {
+         /* The width/height/depth may have been previously reset in
+          * guess_and_alloc_texture. */
+         stObj->width0 = width;
+         stObj->height0 = height;
+         stObj->depth0 = depth;
       }
       /* convert GL dims to Gallium dims */
       st_gl_texture_dims_to_pipe_dims(stObj->base.Target, width, height, depth,

From a31819cff8f4560786d731f5f1de6ba814368a2f Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 17 Jan 2016 22:28:19 -0500
Subject: [PATCH 060/119] nv50/ir: swap the least-ref'd source into src1 when
 both const/imm

The whole point of inlining sources is to reduce loads. We can end up in
a situation where one value is used a lot of times, and one value is
used only once per instruction. The once-per-instruction one is the one
that should get inlined, but with the previous algorithm, it was given
no preference.

This flips things around to preferring putting less-referenced values
into src1 which increases the likelihood of them being inlined.

While we're at it, adjust the heuristic to not treat 0 as an immediate,
as well as (effectively) check for situations where LIMMs can't be
loaded. All this yields improvements on nvc0:

total instructions in shared programs : 6261157 -> 6255985 (-0.08%)
total gprs used in shared programs    : 945082 -> 943417 (-0.18%)
total local used in shared programs   : 30372 -> 30288 (-0.28%)
total bytes used in shared programs   : 50089256 -> 50047880 (-0.08%)

                local        gpr       inst      bytes
    helped          21         822        3332        3332
      hurt           0         278         565         565

And more importantly avoids generating really bad code with SSBOs, where
we end up checking a lot of different values (usually immediates) against
the length.

On nv50 we get comparable results, and even improve packing (bytes went
down more than instructions):

total instructions in shared programs : 6346564 -> 6341277 (-0.08%)
total gprs used in shared programs    : 728719 -> 725131 (-0.49%)
total local used in shared programs   : 3552 -> 3552 (0.00%)
total bytes used in shared programs   : 43995688 -> 43932928 (-0.14%)

                local        gpr       inst      bytes
    helped           0        1380        3252        3774
      hurt           0         287        1710        1365

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_peephole.cpp      | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index f5c590eef10..75773553840 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -171,7 +171,10 @@ LoadPropagation::isImmdLoad(Instruction *ld)
    if (!ld || (ld->op != OP_MOV) ||
        ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
       return false;
-   return ld->src(0).getFile() == FILE_IMMEDIATE;
+
+   // A 0 can be replaced with a register, so it doesn't count as an immediate.
+   ImmediateValue val;
+   return ld->src(0).getImmediate(val) && !val.isInteger(0);
 }
 
 bool
@@ -187,7 +190,8 @@ LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
 void
 LoadPropagation::checkSwapSrc01(Instruction *insn)
 {
-   if (!prog->getTarget()->getOpInfo(insn).commutative)
+   const Target *targ = prog->getTarget();
+   if (!targ->getOpInfo(insn).commutative)
       if (insn->op != OP_SET && insn->op != OP_SLCT)
          return;
    if (insn->src(1).getFile() != FILE_GPR)
@@ -196,14 +200,15 @@ LoadPropagation::checkSwapSrc01(Instruction *insn)
    Instruction *i0 = insn->getSrc(0)->getInsn();
    Instruction *i1 = insn->getSrc(1)->getInsn();
 
-   if (isCSpaceLoad(i0)) {
-      if (!isCSpaceLoad(i1))
-         insn->swapSources(0, 1);
-      else
-         return;
-   } else
-   if (isImmdLoad(i0)) {
-      if (!isCSpaceLoad(i1) && !isImmdLoad(i1))
+   // Swap sources to inline the less frequently used source. That way,
+   // optimistically, it will eventually be able to remove the instruction.
+   int i0refs = insn->getSrc(0)->refCount();
+   int i1refs = insn->getSrc(1)->refCount();
+
+   if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
+      if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
+          !targ->insnCanLoad(insn, 1, i1) ||
+          i0refs < i1refs)
          insn->swapSources(0, 1);
       else
          return;

From d018619d7f52b9a72d7c010d4afb70653d674f12 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Tue, 19 Jan 2016 14:35:50 +1100
Subject: [PATCH 061/119] glsl: fix interface block error message

Print the stream value not the pointer to the expression,
also use the unsigned format specifier.

Cc: 11.1 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/glsl/ast_to_hir.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 0f51c54bfe3..cb57c16f089 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -6358,7 +6358,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
              qual_stream != block_stream) {
             _mesa_glsl_error(&loc, state, "stream layout qualifier on "
                              "interface block member does not match "
-                             "the interface block (%d vs %d)", qual->stream,
+                             "the interface block (%u vs %u)", qual_stream,
                              block_stream);
          }
       }

From 2bcacc69b9935f073e271527771b4fb42081dcb6 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Fri, 8 Jan 2016 14:55:55 +0100
Subject: [PATCH 062/119] mesa: Move sanity check of BindVertexBuffer for
 OpenGL ES 3.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sanity check of BindVertexBuffer for OpenGL ES in
_mesa_handle_bind_buffer_gen breaks OpenGL ES 2 conformance.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93426
Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
---
 src/mesa/main/bufferobj.c | 2 +-
 src/mesa/main/varray.c    | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 26f873bc9a9..8ede1f06e4e 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -953,7 +953,7 @@ _mesa_handle_bind_buffer_gen(struct gl_context *ctx,
 {
    struct gl_buffer_object *buf = *buf_handle;
 
-   if (!buf && (ctx->API == API_OPENGL_CORE || _mesa_is_gles31(ctx))) {
+   if (!buf && (ctx->API == API_OPENGL_CORE)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-gen name)", caller);
       return false;
    }
diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c
index c71e16a1e56..c2bf2951687 100644
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -1744,6 +1744,10 @@ vertex_array_vertex_buffer(struct gl_context *ctx,
    } else if (buffer != 0) {
       vbo = _mesa_lookup_bufferobj(ctx, buffer);
 
+      if (!vbo && _mesa_is_gles31(ctx)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-gen name)", func);
+         return;
+      }
       /* From the GL_ARB_vertex_attrib_array spec:
        *
        *   "[Core profile only:]

From e6281a285012d76cf60fb8639838c369cf4d438f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Fri, 15 Jan 2016 16:56:15 -0500
Subject: [PATCH 063/119] util/u_pstipple.c: copy immediates during
 transformation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apparently, nobody has combined stippling with a fragment shader
containing immediates in almost five years...

Fixes a bug in Kodi with radeonsi reported by Christian König.

Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
Tested-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/util/u_pstipple.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
index 08dec13846d..3428172203b 100644
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -230,6 +230,7 @@ pstip_transform_immed(struct tgsi_transform_context *ctx,
    struct pstip_transform_context *pctx =
       (struct pstip_transform_context *) ctx;
    pctx->numImmed++;
+   ctx->emit_immediate(ctx, immed);
 }
 
 

From f3b067af8656b1d3e0fda3e50bf80dbda5f4c1f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Mon, 18 Jan 2016 20:56:06 +0100
Subject: [PATCH 064/119] st/va: fix motion adaptive deinterlacing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/state_trackers/va/postproc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/state_trackers/va/postproc.c b/src/gallium/state_trackers/va/postproc.c
index f541f7c8aeb..d06f01617df 100644
--- a/src/gallium/state_trackers/va/postproc.c
+++ b/src/gallium/state_trackers/va/postproc.c
@@ -273,7 +273,7 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
 
          case VAProcDeinterlacingMotionAdaptive:
             src = vlVaApplyDeint(drv, context, param, src,
-				 deint->flags & VA_DEINTERLACING_BOTTOM_FIELD);
+				 !!(deint->flags & VA_DEINTERLACING_BOTTOM_FIELD));
             break;
 
          default:

From a439788c59ae8e09854cebc7d8e4edd0eee6d391 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Fri, 15 Jan 2016 13:31:34 -0800
Subject: [PATCH 065/119] glsl: Restore Mesa-style to shader_enums.c/h.

---
 src/glsl/nir/shader_enums.c | 24 ++++++++++++++++--------
 src/glsl/nir/shader_enums.h | 16 ++++++++--------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/src/glsl/nir/shader_enums.c b/src/glsl/nir/shader_enums.c
index 10f546a9814..16b20db3603 100644
--- a/src/glsl/nir/shader_enums.c
+++ b/src/glsl/nir/shader_enums.c
@@ -33,7 +33,8 @@
 #define ENUM(x) [x] = #x
 #define NAME(val) ((((val) < ARRAY_SIZE(names)) && names[(val)]) ? names[(val)] : "UNKNOWN")
 
-const char * gl_shader_stage_name(gl_shader_stage stage)
+const char *
+gl_shader_stage_name(gl_shader_stage stage)
 {
    static const char *names[] = {
       ENUM(MESA_SHADER_VERTEX),
@@ -51,7 +52,8 @@ const char * gl_shader_stage_name(gl_shader_stage stage)
  * Translate a gl_shader_stage to a short shader stage name for debug
  * printouts and error messages.
  */
-const char * _mesa_shader_stage_to_string(unsigned stage)
+const char *
+_mesa_shader_stage_to_string(unsigned stage)
 {
    switch (stage) {
    case MESA_SHADER_VERTEX:   return "vertex";
@@ -69,7 +71,8 @@ const char * _mesa_shader_stage_to_string(unsigned stage)
  * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
  * for debug printouts and error messages.
  */
-const char * _mesa_shader_stage_to_abbrev(unsigned stage)
+const char *
+_mesa_shader_stage_to_abbrev(unsigned stage)
 {
    switch (stage) {
    case MESA_SHADER_VERTEX:   return "VS";
@@ -83,7 +86,8 @@ const char * _mesa_shader_stage_to_abbrev(unsigned stage)
    unreachable("Unknown shader stage.");
 }
 
-const char * gl_vert_attrib_name(gl_vert_attrib attrib)
+const char *
+gl_vert_attrib_name(gl_vert_attrib attrib)
 {
    static const char *names[] = {
       ENUM(VERT_ATTRIB_POS),
@@ -124,7 +128,8 @@ const char * gl_vert_attrib_name(gl_vert_attrib attrib)
    return NAME(attrib);
 }
 
-const char * gl_varying_slot_name(gl_varying_slot slot)
+const char *
+gl_varying_slot_name(gl_varying_slot slot)
 {
    static const char *names[] = {
       ENUM(VARYING_SLOT_POS),
@@ -190,7 +195,8 @@ const char * gl_varying_slot_name(gl_varying_slot slot)
    return NAME(slot);
 }
 
-const char * gl_system_value_name(gl_system_value sysval)
+const char *
+gl_system_value_name(gl_system_value sysval)
 {
    static const char *names[] = {
      ENUM(SYSTEM_VALUE_VERTEX_ID),
@@ -216,7 +222,8 @@ const char * gl_system_value_name(gl_system_value sysval)
    return NAME(sysval);
 }
 
-const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual)
+const char *
+glsl_interp_qualifier_name(enum glsl_interp_qualifier qual)
 {
    static const char *names[] = {
       ENUM(INTERP_QUALIFIER_NONE),
@@ -228,7 +235,8 @@ const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual)
    return NAME(qual);
 }
 
-const char * gl_frag_result_name(gl_frag_result result)
+const char *
+gl_frag_result_name(gl_frag_result result)
 {
    static const char *names[] = {
       ENUM(FRAG_RESULT_DEPTH),
diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h
index c747464d094..efc0b0d515e 100644
--- a/src/glsl/nir/shader_enums.h
+++ b/src/glsl/nir/shader_enums.h
@@ -47,19 +47,19 @@ typedef enum
    MESA_SHADER_COMPUTE = 5,
 } gl_shader_stage;
 
-const char * gl_shader_stage_name(gl_shader_stage stage);
+const char *gl_shader_stage_name(gl_shader_stage stage);
 
 /**
  * Translate a gl_shader_stage to a short shader stage name for debug
  * printouts and error messages.
  */
-const char * _mesa_shader_stage_to_string(unsigned stage);
+const char *_mesa_shader_stage_to_string(unsigned stage);
 
 /**
  * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
  * for debug printouts and error messages.
  */
-const char * _mesa_shader_stage_to_abbrev(unsigned stage);
+const char *_mesa_shader_stage_to_abbrev(unsigned stage);
 
 #define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
 
@@ -109,7 +109,7 @@ typedef enum
    VERT_ATTRIB_MAX = 33
 } gl_vert_attrib;
 
-const char * gl_vert_attrib_name(gl_vert_attrib attrib);
+const char *gl_vert_attrib_name(gl_vert_attrib attrib);
 
 /**
  * Symbolic constats to help iterating over
@@ -254,7 +254,7 @@ typedef enum
 #define VARYING_SLOT_PATCH0	(VARYING_SLOT_MAX)
 #define VARYING_SLOT_TESS_MAX	(VARYING_SLOT_PATCH0 + MAX_VARYING)
 
-const char * gl_varying_slot_name(gl_varying_slot slot);
+const char *gl_varying_slot_name(gl_varying_slot slot);
 
 /**
  * Bitflags for varying slots.
@@ -465,7 +465,7 @@ typedef enum
    SYSTEM_VALUE_MAX             /**< Number of values */
 } gl_system_value;
 
-const char * gl_system_value_name(gl_system_value sysval);
+const char *gl_system_value_name(gl_system_value sysval);
 
 /**
  * The possible interpolation qualifiers that can be applied to a fragment
@@ -483,7 +483,7 @@ enum glsl_interp_qualifier
    INTERP_QUALIFIER_COUNT /**< Number of interpolation qualifiers */
 };
 
-const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual);
+const char *glsl_interp_qualifier_name(enum glsl_interp_qualifier qual);
 
 /**
  * Fragment program results
@@ -514,7 +514,7 @@ typedef enum
    FRAG_RESULT_DATA7,
 } gl_frag_result;
 
-const char * gl_frag_result_name(gl_frag_result result);
+const char *gl_frag_result_name(gl_frag_result result);
 
 #define FRAG_RESULT_MAX		(FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
 

From e734fb0326e61470d4341cc7bda52a25c96e1a72 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Fri, 15 Jan 2016 13:38:46 -0800
Subject: [PATCH 066/119] i965: Inform compiler of variable range to silence
 warning.

Extends commit 6531ccb70 to silence the warning in release builds as
well.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/drivers/dri/i965/brw_vue_map.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
index fea24368e8c..b66c209b24d 100644
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -248,6 +248,8 @@ brw_compute_tess_vue_map(struct brw_vue_map *vue_map,
 static const char *
 varying_name(brw_varying_slot slot)
 {
+   assume(slot < BRW_VARYING_SLOT_COUNT);
+
    if (slot < VARYING_SLOT_MAX)
       return gl_varying_slot_name(slot);
 
@@ -257,7 +259,6 @@ varying_name(brw_varying_slot slot)
       [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC",
    };
 
-   assert(slot < BRW_VARYING_SLOT_COUNT);
    return brw_names[slot - VARYING_SLOT_MAX];
 }
 

From 866a6bf9f70625517d6d2c17be9523b9f035f1db Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Tue, 19 Jan 2016 12:12:38 -0800
Subject: [PATCH 067/119] i965/vec4: Spaces around operators.

---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 4ee2ed47d40..1b87e3044c2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1069,7 +1069,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_umul_high: {
       struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
 
-      if (devinfo->gen >=8)
+      if (devinfo->gen >= 8)
          emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW)));
       else
          emit(MUL(acc, op[0], op[1]));

From c8b8c578d145f90794611602eb66fc7d3f1df033 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 15 Jan 2016 11:01:25 +1100
Subject: [PATCH 068/119] glsl: allow duplicate layout-qualifier-names

This is added by ARB_enhanced_layouts although it doesn't fit
into any of the six main changes so we enable this independently.

From the ARB_enhanced_layouts spec:

   "More than one layout qualifier may appear in a single
   declaration. Additionally, the same layout-qualifier-name
   can occur multiple times within a layout qualifier or across
   multiple layout qualifiers in the  same declaration"

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/glsl/ast_type.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index 1f675b28ae8..4e750161b48 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -157,7 +157,8 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
       allowed_duplicates_mask.flags.i |=
          stream_layout_mask.flags.i;
 
-   if ((this->flags.i & q.flags.i & ~allowed_duplicates_mask.flags.i) != 0) {
+   if (!state->has_enhanced_layouts() &&
+       (this->flags.i & q.flags.i & ~allowed_duplicates_mask.flags.i) != 0) {
       _mesa_glsl_error(loc, state,
 		       "duplicate layout qualifiers used");
       return false;

From fd612e4547175d1350eac2d8374d01390de58acb Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 18 Jan 2016 16:09:06 +1100
Subject: [PATCH 069/119] glsl: split layout_defaults into specific types

This will allow merging of duplicate layout qualifiers as allowed
by ARB_shading_language_420pack

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/glsl/glsl_parser.yy | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 51796a65df9..462ca45a55c 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -299,6 +299,10 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 %type <node> for_init_statement
 %type <for_rest_statement> for_rest_statement
 %type <node> layout_defaults
+%type <node> layout_uniform_defaults
+%type <node> layout_buffer_defaults
+%type <node> layout_in_defaults
+%type <node> layout_out_defaults
 
 %right THEN ELSE
 %%
@@ -2737,7 +2741,7 @@ member_declaration:
    }
    ;
 
-layout_defaults:
+layout_uniform_defaults:
    layout_qualifier UNIFORM ';'
    {
       if (!state->default_uniform_qualifier->merge_qualifier(& @1, state, $1)) {
@@ -2745,8 +2749,10 @@ layout_defaults:
       }
       $$ = NULL;
    }
+   ;
 
-   | layout_qualifier BUFFER ';'
+layout_buffer_defaults:
+   layout_qualifier BUFFER ';'
    {
       if (!state->default_shader_storage_qualifier->merge_qualifier(& @1, state, $1)) {
          YYERROR;
@@ -2764,16 +2770,20 @@ layout_defaults:
 
       $$ = NULL;
    }
+   ;
 
-   | layout_qualifier IN_TOK ';'
+layout_in_defaults:
+   layout_qualifier IN_TOK ';'
    {
       $$ = NULL;
       if (!state->in_qualifier->merge_in_qualifier(& @1, state, $1, $$)) {
          YYERROR;
       }
    }
+   ;
 
-   | layout_qualifier OUT_TOK ';'
+layout_out_defaults:
+   layout_qualifier OUT_TOK ';'
    {
       $$ = NULL;
       if (state->stage == MESA_SHADER_GEOMETRY) {
@@ -2804,3 +2814,11 @@ layout_defaults:
                           "tessellation control or geometry shaders");
       }
    }
+   ;
+
+layout_defaults:
+   layout_uniform_defaults
+   | layout_buffer_defaults
+   | layout_in_defaults
+   | layout_out_defaults
+   ;

From a0a93470e3413d5a5890d988462bb898362bd68f Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 18 Jan 2016 19:13:03 +1100
Subject: [PATCH 070/119] glsl: move default layout qualifier rules out of the
 parser

Acked-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/glsl/ast_type.cpp   | 22 +++++++++++++++++++++-
 src/glsl/glsl_parser.yy | 29 ++---------------------------
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index 4e750161b48..e59d1b2754b 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -296,8 +296,28 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
    void *mem_ctx = state;
    const bool r = this->merge_qualifier(loc, state, q);
 
-   if (state->stage == MESA_SHADER_TESS_CTRL) {
+   if (state->stage == MESA_SHADER_GEOMETRY) {
+      if (q.flags.q.prim_type) {
+         /* Make sure this is a valid output primitive type. */
+         switch (q.prim_type) {
+         case GL_POINTS:
+         case GL_LINE_STRIP:
+         case GL_TRIANGLE_STRIP:
+            break;
+         default:
+            _mesa_glsl_error(loc, state, "invalid geometry shader output "
+                             "primitive type");
+            break;
+         }
+      }
+
+      /* Allow future assigments of global out's stream id value */
+      this->flags.q.explicit_stream = 0;
+   } else if (state->stage == MESA_SHADER_TESS_CTRL) {
       node = new(mem_ctx) ast_tcs_output_layout(*loc);
+   } else {
+      _mesa_glsl_error(loc, state, "out layout qualifiers only valid in "
+                       "tessellation control or geometry shaders");
    }
 
    return r;
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 462ca45a55c..925cb820ba9 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -2786,33 +2786,8 @@ layout_out_defaults:
    layout_qualifier OUT_TOK ';'
    {
       $$ = NULL;
-      if (state->stage == MESA_SHADER_GEOMETRY) {
-         if ($1.flags.q.prim_type) {
-            /* Make sure this is a valid output primitive type. */
-            switch ($1.prim_type) {
-            case GL_POINTS:
-            case GL_LINE_STRIP:
-            case GL_TRIANGLE_STRIP:
-               break;
-            default:
-               _mesa_glsl_error(&@1, state, "invalid geometry shader output "
-                                "primitive type");
-               break;
-            }
-         }
-         if (!state->out_qualifier->merge_qualifier(& @1, state, $1))
-            YYERROR;
-
-         /* Allow future assigments of global out's stream id value */
-         state->out_qualifier->flags.q.explicit_stream = 0;
-      } else if (state->stage == MESA_SHADER_TESS_CTRL) {
-         if (!state->out_qualifier->merge_out_qualifier(& @1, state, $1, $$))
-            YYERROR;
-      } else {
-         _mesa_glsl_error(& @1, state,
-                          "out layout qualifiers only valid in "
-                          "tessellation control or geometry shaders");
-      }
+      if (!state->out_qualifier->merge_out_qualifier(& @1, state, $1, $$))
+         YYERROR;
    }
    ;
 

From 564009986ff1485c467664542a9042e6ce4dcdfe Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 18 Jan 2016 17:06:57 +1100
Subject: [PATCH 071/119] glsl: update parser to allow duplicate default layout
 qualifiers

In order to only create a single node for each default declaration
we add a new boolean parameter to the in/out merge function to
only create one once we reach the rightmost layout qualifier.

From the ARB_shading_language_420pack spec:

   "More than one layout qualifier may appear in a single
   declaration. If the same layout-qualifier-name occurs in
   multiple layout qualifiers for the same declaration, the
   last one overrides the former ones."

Acked-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/glsl/ast.h          |  4 +--
 src/glsl/ast_type.cpp   | 18 ++++++-----
 src/glsl/glsl_parser.yy | 66 +++++++++++++++++++++++++++++++++++++----
 3 files changed, 73 insertions(+), 15 deletions(-)

diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index f8ab0b71b7b..465166994ea 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -704,12 +704,12 @@ struct ast_type_qualifier {
    bool merge_out_qualifier(YYLTYPE *loc,
                            _mesa_glsl_parse_state *state,
                            const ast_type_qualifier &q,
-                           ast_node* &node);
+                           ast_node* &node, bool create_node);
 
    bool merge_in_qualifier(YYLTYPE *loc,
                            _mesa_glsl_parse_state *state,
                            const ast_type_qualifier &q,
-                           ast_node* &node);
+                           ast_node* &node, bool create_node);
 
    ast_subroutine_list *subroutine_list;
 };
diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index e59d1b2754b..32cb0a0319c 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -291,7 +291,7 @@ bool
 ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
                                         _mesa_glsl_parse_state *state,
                                         const ast_type_qualifier &q,
-                                        ast_node* &node)
+                                        ast_node* &node, bool create_node)
 {
    void *mem_ctx = state;
    const bool r = this->merge_qualifier(loc, state, q);
@@ -314,7 +314,9 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
       /* Allow future assigments of global out's stream id value */
       this->flags.q.explicit_stream = 0;
    } else if (state->stage == MESA_SHADER_TESS_CTRL) {
-      node = new(mem_ctx) ast_tcs_output_layout(*loc);
+      if (create_node) {
+         node = new(mem_ctx) ast_tcs_output_layout(*loc);
+      }
    } else {
       _mesa_glsl_error(loc, state, "out layout qualifiers only valid in "
                        "tessellation control or geometry shaders");
@@ -327,7 +329,7 @@ bool
 ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
                                        _mesa_glsl_parse_state *state,
                                        const ast_type_qualifier &q,
-                                       ast_node* &node)
+                                       ast_node* &node, bool create_node)
 {
    void *mem_ctx = state;
    bool create_gs_ast = false;
@@ -467,10 +469,12 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
       this->point_mode = q.point_mode;
    }
 
-   if (create_gs_ast) {
-      node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type);
-   } else if (create_cs_ast) {
-      node = new(mem_ctx) ast_cs_input_layout(*loc, q.local_size);
+   if (create_node) {
+      if (create_gs_ast) {
+         node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type);
+      } else if (create_cs_ast) {
+         node = new(mem_ctx) ast_cs_input_layout(*loc, q.local_size);
+      }
    }
 
    return true;
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 925cb820ba9..6099aeb12ca 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -2742,7 +2742,20 @@ member_declaration:
    ;
 
 layout_uniform_defaults:
-   layout_qualifier UNIFORM ';'
+   layout_qualifier layout_uniform_defaults
+   {
+      $$ = NULL;
+      if (!state->has_420pack_or_es31()) {
+         _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
+         YYERROR;
+      } else {
+         if (!state->default_uniform_qualifier->
+                merge_qualifier(& @1, state, $1)) {
+            YYERROR;
+         }
+      }
+   }
+   | layout_qualifier UNIFORM ';'
    {
       if (!state->default_uniform_qualifier->merge_qualifier(& @1, state, $1)) {
          YYERROR;
@@ -2752,7 +2765,20 @@ layout_uniform_defaults:
    ;
 
 layout_buffer_defaults:
-   layout_qualifier BUFFER ';'
+   layout_qualifier layout_buffer_defaults
+   {
+      $$ = NULL;
+      if (!state->has_420pack_or_es31()) {
+         _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
+         YYERROR;
+      } else {
+         if (!state->default_shader_storage_qualifier->
+                merge_qualifier(& @1, state, $1)) {
+            YYERROR;
+         }
+      }
+   }
+   | layout_qualifier BUFFER ';'
    {
       if (!state->default_shader_storage_qualifier->merge_qualifier(& @1, state, $1)) {
          YYERROR;
@@ -2773,20 +2799,48 @@ layout_buffer_defaults:
    ;
 
 layout_in_defaults:
-   layout_qualifier IN_TOK ';'
+   layout_qualifier layout_in_defaults
    {
       $$ = NULL;
-      if (!state->in_qualifier->merge_in_qualifier(& @1, state, $1, $$)) {
+      if (!state->has_420pack_or_es31()) {
+         _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
+         YYERROR;
+      } else {
+         if (!state->in_qualifier->
+                merge_in_qualifier(& @1, state, $1, $$, false)) {
+            YYERROR;
+         }
+      }
+   }
+   | layout_qualifier IN_TOK ';'
+   {
+      $$ = NULL;
+      if (!state->in_qualifier->
+             merge_in_qualifier(& @1, state, $1, $$, true)) {
          YYERROR;
       }
    }
    ;
 
 layout_out_defaults:
-   layout_qualifier OUT_TOK ';'
+   layout_qualifier layout_out_defaults
    {
       $$ = NULL;
-      if (!state->out_qualifier->merge_out_qualifier(& @1, state, $1, $$))
+      if (!state->has_420pack_or_es31()) {
+         _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
+         YYERROR;
+      } else {
+         if (!state->out_qualifier->
+                merge_out_qualifier(& @1, state, $1, $$, false)) {
+            YYERROR;
+         }
+      }
+   }
+   | layout_qualifier OUT_TOK ';'
+   {
+      $$ = NULL;
+      if (!state->out_qualifier->
+             merge_out_qualifier(& @1, state, $1, $$, true))
          YYERROR;
    }
    ;

From 6a660a5f5dad02a6594ea905c511ba3cae6862a5 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 15 Jan 2016 12:43:10 +1100
Subject: [PATCH 072/119] glsl: allow multiple layout qualifiers for a single
 declaration

From the ARB_shading_language_420pack spec:

   "More than one layout qualifier may appear in a single
   declaration. If the same layout-qualifier-name occurs in
   multiple layout qualifiers for the same declaration, the
   last one overrides the former ones."

The parser was already failing correctly when the extension is
not available but testing for duplicates within a single layout
qualifier was still causing this to fail when available as both
cases share the same function for merging.

Here we add a parameter to differentiate between the two uses
and apply it to the duplicate test.

Acked-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
---
 src/glsl/ast.h          |  3 ++-
 src/glsl/ast_type.cpp   | 12 +++++++++---
 src/glsl/glsl_parser.yy | 32 +++++++++++++++++---------------
 3 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index 465166994ea..03df6c08b2b 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -699,7 +699,8 @@ struct ast_type_qualifier {
 
    bool merge_qualifier(YYLTYPE *loc,
 			_mesa_glsl_parse_state *state,
-			const ast_type_qualifier &q);
+                        const ast_type_qualifier &q,
+                        bool is_single_layout_merge);
 
    bool merge_out_qualifier(YYLTYPE *loc,
                            _mesa_glsl_parse_state *state,
diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index 32cb0a0319c..cf494d96724 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -114,10 +114,16 @@ ast_type_qualifier::interpolation_string() const
       return NULL;
 }
 
+/**
+ * This function merges both duplicate identifies within a single layout and
+ * multiple layout qualifiers on a single variable declaration. The
+ * is_single_layout_merge param is used differentiate between the two.
+ */
 bool
 ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
 				    _mesa_glsl_parse_state *state,
-				    const ast_type_qualifier &q)
+                                    const ast_type_qualifier &q,
+                                    bool is_single_layout_merge)
 {
    ast_type_qualifier ubo_mat_mask;
    ubo_mat_mask.flags.i = 0;
@@ -157,7 +163,7 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
       allowed_duplicates_mask.flags.i |=
          stream_layout_mask.flags.i;
 
-   if (!state->has_enhanced_layouts() &&
+   if (is_single_layout_merge && !state->has_enhanced_layouts() &&
        (this->flags.i & q.flags.i & ~allowed_duplicates_mask.flags.i) != 0) {
       _mesa_glsl_error(loc, state,
 		       "duplicate layout qualifiers used");
@@ -294,7 +300,7 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
                                         ast_node* &node, bool create_node)
 {
    void *mem_ctx = state;
-   const bool r = this->merge_qualifier(loc, state, q);
+   const bool r = this->merge_qualifier(loc, state, q, false);
 
    if (state->stage == MESA_SHADER_GEOMETRY) {
       if (q.flags.q.prim_type) {
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 6099aeb12ca..10198758944 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -957,7 +957,7 @@ parameter_qualifier:
                                       "or precise");
 
       $$ = $1;
-      $$.merge_qualifier(&@1, state, $2);
+      $$.merge_qualifier(&@1, state, $2, false);
    }
    | precision_qualifier parameter_qualifier
    {
@@ -974,7 +974,7 @@ parameter_qualifier:
    | memory_qualifier parameter_qualifier
    {
       $$ = $1;
-      $$.merge_qualifier(&@1, state, $2);
+      $$.merge_qualifier(&@1, state, $2, false);
    }
 
 parameter_direction_qualifier:
@@ -1153,7 +1153,7 @@ layout_qualifier_id_list:
    | layout_qualifier_id_list ',' layout_qualifier_id
    {
       $$ = $1;
-      if (!$$.merge_qualifier(& @3, state, $3)) {
+      if (!$$.merge_qualifier(& @3, state, $3, true)) {
          YYERROR;
       }
    }
@@ -1762,7 +1762,7 @@ type_qualifier:
       }
 
       $$ = $1;
-      $$.merge_qualifier(&@1, state, $2);
+      $$.merge_qualifier(&@1, state, $2, false);
    }
    | layout_qualifier type_qualifier
    {
@@ -1779,12 +1779,12 @@ type_qualifier:
          _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
 
       $$ = $1;
-      $$.merge_qualifier(&@1, state, $2);
+      $$.merge_qualifier(&@1, state, $2, false);
    }
    | subroutine_qualifier type_qualifier
    {
       $$ = $1;
-      $$.merge_qualifier(&@1, state, $2);
+      $$.merge_qualifier(&@1, state, $2, false);
    }
    | auxiliary_storage_qualifier type_qualifier
    {
@@ -1800,7 +1800,7 @@ type_qualifier:
                           "just before storage qualifiers");
       }
       $$ = $1;
-      $$.merge_qualifier(&@1, state, $2);
+      $$.merge_qualifier(&@1, state, $2, false);
    }
    | storage_qualifier type_qualifier
    {
@@ -1820,7 +1820,7 @@ type_qualifier:
       }
 
       $$ = $1;
-      $$.merge_qualifier(&@1, state, $2);
+      $$.merge_qualifier(&@1, state, $2, false);
    }
    | precision_qualifier type_qualifier
    {
@@ -1837,7 +1837,7 @@ type_qualifier:
    | memory_qualifier type_qualifier
    {
       $$ = $1;
-      $$.merge_qualifier(&@1, state, $2);
+      $$.merge_qualifier(&@1, state, $2, false);
    }
    ;
 
@@ -2589,7 +2589,7 @@ interface_block:
          YYERROR;
       }
 
-      if (!block->layout.merge_qualifier(& @1, state, $1)) {
+      if (!block->layout.merge_qualifier(& @1, state, $1, false)) {
          YYERROR;
       }
 
@@ -2606,7 +2606,7 @@ interface_block:
                              "memory qualifiers can only be used in the "
                              "declaration of shader storage blocks");
       }
-      if (!block->layout.merge_qualifier(& @1, state, $1)) {
+      if (!block->layout.merge_qualifier(& @1, state, $1, false)) {
          YYERROR;
       }
       $$ = block;
@@ -2750,14 +2750,15 @@ layout_uniform_defaults:
          YYERROR;
       } else {
          if (!state->default_uniform_qualifier->
-                merge_qualifier(& @1, state, $1)) {
+                merge_qualifier(& @1, state, $1, false)) {
             YYERROR;
          }
       }
    }
    | layout_qualifier UNIFORM ';'
    {
-      if (!state->default_uniform_qualifier->merge_qualifier(& @1, state, $1)) {
+      if (!state->default_uniform_qualifier->
+             merge_qualifier(& @1, state, $1, false)) {
          YYERROR;
       }
       $$ = NULL;
@@ -2773,14 +2774,15 @@ layout_buffer_defaults:
          YYERROR;
       } else {
          if (!state->default_shader_storage_qualifier->
-                merge_qualifier(& @1, state, $1)) {
+                merge_qualifier(& @1, state, $1, false)) {
             YYERROR;
          }
       }
    }
    | layout_qualifier BUFFER ';'
    {
-      if (!state->default_shader_storage_qualifier->merge_qualifier(& @1, state, $1)) {
+      if (!state->default_shader_storage_qualifier->
+             merge_qualifier(& @1, state, $1, false)) {
          YYERROR;
       }
 

From 11fc7ad62ef9aa4c9df71e4e001582f8017e7a81 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 6 Jan 2016 12:40:12 +1100
Subject: [PATCH 073/119] mesa: remove link validation that should be done
 elsewhere
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Even if re-linking fails rendering shouldn't fail as the previous
succesfully linked program will still be available. It also shouldn't
be possible to have an unlinked program as part of the current rendering
state.

This fixes a subtest in:
ES31-CTS.sepshaderobjs.StateInteraction

This change should improve performance on CPU limited benchmarks as noted
in commit d6c6b186cf308f.

>From Section 7.3 (Program Objects) of the OpenGL 4.5 spec:

   "If a program object that is active for any shader stage is re-linked
    unsuccessfully, the link status will be set to FALSE, but any existing
    executables and associated state will remain part of the current rendering
    state until a subsequent call to UseProgram, UseProgramStages, or
    BindProgramPipeline removes them from use. If such a program is attached to
    any program pipeline object, the existing executables and associated state
    will remain part of the program pipeline object until a subsequent call to
    UseProgramStages removes them from use. An unsuccessfully linked program may
    not be made part of the current rendering state by UseProgram or added to
    program pipeline objects by UseProgramStages until it is successfully
    re-linked."

   "void UseProgram(uint program);

   ...

   An INVALID_OPERATION error is generated if program has not been linked, or
   was last linked unsuccessfully.  The current rendering state is not modified."

V2: apply the rule to both core and compat.

Cc: Tapani Pälli <tapani.palli@intel.com>
Cc: Brian Paul <brianp@vmware.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/context.c | 63 ++---------------------------------------
 1 file changed, 3 insertions(+), 60 deletions(-)

diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index be983d4c86a..f3fd01f395e 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -1930,31 +1930,6 @@ _mesa_check_blend_func_error(struct gl_context *ctx)
    return GL_TRUE;
 }
 
-static bool
-shader_linked_or_absent(struct gl_context *ctx,
-                        const struct gl_shader_program *shProg,
-                        bool *shader_present, const char *where)
-{
-   if (shProg) {
-      *shader_present = true;
-
-      if (!shProg->LinkStatus) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(shader not linked)", where);
-         return false;
-      }
-#if 0 /* not normally enabled */
-      {
-         char errMsg[100];
-         if (!_mesa_validate_shader_program(ctx, shProg, errMsg)) {
-            _mesa_warning(ctx, "Shader program %u is invalid: %s",
-                          shProg->Name, errMsg);
-         }
-      }
-#endif
-   }
-
-   return true;
-}
 
 /**
  * Prior to drawing anything with glBegin, glDrawArrays, etc. this function
@@ -1967,54 +1942,22 @@ shader_linked_or_absent(struct gl_context *ctx,
 GLboolean
 _mesa_valid_to_render(struct gl_context *ctx, const char *where)
 {
-   unsigned i;
-
    /* This depends on having up to date derived state (shaders) */
    if (ctx->NewState)
       _mesa_update_state(ctx);
 
-   if (ctx->API == API_OPENGL_CORE || ctx->API == API_OPENGLES2) {
-      bool from_glsl_shader[MESA_SHADER_COMPUTE] = { false };
-
-      for (i = 0; i < MESA_SHADER_COMPUTE; i++) {
-         if (!shader_linked_or_absent(ctx, ctx->_Shader->CurrentProgram[i],
-                                      &from_glsl_shader[i], where))
-            return GL_FALSE;
-      }
-
-      /* In OpenGL Core Profile and OpenGL ES 2.0 / 3.0, there are no assembly
-       * shaders.  Don't check state related to those.
-       */
-   } else {
-      bool has_vertex_shader = false;
-      bool has_fragment_shader = false;
-
-      /* In OpenGL Compatibility Profile, there is only vertex shader and
-       * fragment shader.  We take this path also for API_OPENGLES because
-       * optimizing that path would make the other (more common) paths
-       * slightly slower.
-       */
-      if (!shader_linked_or_absent(ctx,
-                                   ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX],
-                                   &has_vertex_shader, where))
-         return GL_FALSE;
-
-      if (!shader_linked_or_absent(ctx,
-                                   ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT],
-                                   &has_fragment_shader, where))
-         return GL_FALSE;
-
+   if (ctx->API == API_OPENGL_COMPAT) {
       /* Any shader stages that are not supplied by the GLSL shader and have
        * assembly shaders enabled must now be validated.
        */
-      if (!has_vertex_shader
+      if (!ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX]
           && ctx->VertexProgram.Enabled && !ctx->VertexProgram._Enabled) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "%s(vertex program not valid)", where);
          return GL_FALSE;
       }
 
-      if (!has_fragment_shader) {
+      if (!ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT]) {
          if (ctx->FragmentProgram.Enabled && !ctx->FragmentProgram._Enabled) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(fragment program not valid)", where);

From 4fc018576b577ed9ab3b713ba21727479a0e8b23 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 31 Dec 2015 16:28:08 -0800
Subject: [PATCH 074/119] glsl: Don't abbreviate tessellation shader stage
 names.

I have a patch that writes shaders as .shader_test files, and it uses
this function to create the headers (i.e. [vertex shader]).

[tess ctrl shader] isn't a valid shader_runner header - it's spelled
out as [tessellation control shader].

There's no real reason to abbreviate it, so spell it out.

v2: Rebase on Rob's patches to move the code.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/glsl/nir/shader_enums.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/glsl/nir/shader_enums.c b/src/glsl/nir/shader_enums.c
index 16b20db3603..942d152b129 100644
--- a/src/glsl/nir/shader_enums.c
+++ b/src/glsl/nir/shader_enums.c
@@ -60,8 +60,8 @@ _mesa_shader_stage_to_string(unsigned stage)
    case MESA_SHADER_FRAGMENT: return "fragment";
    case MESA_SHADER_GEOMETRY: return "geometry";
    case MESA_SHADER_COMPUTE:  return "compute";
-   case MESA_SHADER_TESS_CTRL: return "tess ctrl";
-   case MESA_SHADER_TESS_EVAL: return "tess eval";
+   case MESA_SHADER_TESS_CTRL: return "tessellation control";
+   case MESA_SHADER_TESS_EVAL: return "tessellation evaluation";
    }
 
    unreachable("Unknown shader stage.");

From 9e4c8acd7814186673c4945c7045124ebbcd125f Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 16 Jan 2016 15:05:51 -0800
Subject: [PATCH 075/119] i965: Trigger CS state reemission when new sampler
 state is uploaded.

This reuses the NEW_SAMPLER_STATE_TABLE state bit (currently only used
on pre-Gen7 hardware) to signal that the sampler state tables have
changed in order to make sure that the GPGPU interface descriptor is
updated.

Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_sampler_state.c | 2 +-
 src/mesa/drivers/dri/i965/gen7_cs_state.c     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index d181468f5cb..24798a5c552 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -582,7 +582,7 @@ brw_upload_sampler_state_table(struct brw_context *brw,
       batch_offset_for_sampler_state += size_in_bytes;
    }
 
-   if (brw->gen >= 7) {
+   if (brw->gen >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
       /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
       gen7_emit_sampler_state_pointers_xs(brw, stage_state);
    } else {
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index a025bb9dd66..6d6988c6a41 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -196,6 +196,7 @@ const struct brw_tracked_state brw_cs_state = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_CS_PROG_DATA |
              BRW_NEW_PUSH_CONSTANT_ALLOCATION |
+             BRW_NEW_SAMPLER_STATE_TABLE |
              BRW_NEW_SURFACES,
    },
    .emit = brw_upload_cs_state

From f8ac314cc2353f439e6a917db4e3aeaf47e2093e Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Sat, 16 Jan 2016 15:11:03 -0800
Subject: [PATCH 076/119] i965: Implement compute sampler state atom.

Fixes a number of GLES31 CTS failures and hangs on various hardware:

 ES31-CTS.texture_gather.plain-gather-depth-2d
 ES31-CTS.texture_gather.plain-gather-depth-2darray
 ES31-CTS.texture_gather.plain-gather-depth-cube
 ES31-CTS.texture_gather.offset-gather-depth-2d
 ES31-CTS.texture_gather.offset-gather-depth-2darray
 ES31-CTS.layout_binding.sampler2D_layout_binding_texture_ComputeShader
 ES31-CTS.layout_binding.sampler2DArray_layout_binding_texture_ComputeShader
 ES31-CTS.explicit_uniform_location.uniform-loc-types-samplers
 ES31-CTS.compute_shader.resources-texture

Some of them were actually passing by luck on some generations even
though we weren't uploading sampler state tables explicitly for the
compute stage, most likely because they relied on the cached sampler
state left from previous rendering to be close enough.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92589
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93312
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93325
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93407
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93725
Reported-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_context.h       |  2 +-
 src/mesa/drivers/dri/i965/brw_sampler_state.c | 20 +++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_state.h         |  1 +
 src/mesa/drivers/dri/i965/brw_state_upload.c  |  2 ++
 4 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index b80db00eb75..2a29dfe5eec 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1264,7 +1264,7 @@ struct brw_context
 
    int num_atoms[BRW_NUM_PIPELINES];
    const struct brw_tracked_state render_atoms[76];
-   const struct brw_tracked_state compute_atoms[10];
+   const struct brw_tracked_state compute_atoms[11];
 
    /* If (INTEL_DEBUG & DEBUG_BATCH) */
    struct {
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index 24798a5c552..c20a02817f9 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -693,3 +693,23 @@ const struct brw_tracked_state brw_tes_samplers = {
    },
    .emit = brw_upload_tes_samplers,
 };
+
+static void
+brw_upload_cs_samplers(struct brw_context *brw)
+{
+   /* BRW_NEW_COMPUTE_PROGRAM */
+   struct gl_program *cs = (struct gl_program *) brw->compute_program;
+   if (!cs)
+      return;
+
+   brw_upload_sampler_state_table(brw, cs, &brw->cs.base);
+}
+
+const struct brw_tracked_state brw_cs_samplers = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_COMPUTE_PROGRAM,
+   },
+   .emit = brw_upload_cs_samplers,
+};
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 7d61b7c4ab6..f44ccd6e071 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -75,6 +75,7 @@ extern const struct brw_tracked_state brw_vs_samplers;
 extern const struct brw_tracked_state brw_tcs_samplers;
 extern const struct brw_tracked_state brw_tes_samplers;
 extern const struct brw_tracked_state brw_gs_samplers;
+extern const struct brw_tracked_state brw_cs_samplers;
 extern const struct brw_tracked_state brw_vs_ubo_surfaces;
 extern const struct brw_tracked_state brw_vs_abo_surfaces;
 extern const struct brw_tracked_state brw_vs_image_surfaces;
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 876e130f1cd..ee75ca88549 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -282,6 +282,7 @@ static const struct brw_tracked_state *gen7_compute_atoms[] =
    &brw_cs_abo_surfaces,
    &brw_texture_surfaces,
    &brw_cs_work_groups_surface,
+   &brw_cs_samplers,
    &brw_cs_state,
 };
 
@@ -396,6 +397,7 @@ static const struct brw_tracked_state *gen8_compute_atoms[] =
    &brw_cs_abo_surfaces,
    &brw_texture_surfaces,
    &brw_cs_work_groups_surface,
+   &brw_cs_samplers,
    &brw_cs_state,
 };
 

From b21973acaa67fb7945a12fc266e20281d7eb5375 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Mon, 18 Jan 2016 04:29:22 +0100
Subject: [PATCH 077/119] llvmpipe: turn depth clears into full depth/stencil
 clears for d24x8 formats

If we have a d24x8 format, there is no stencil. Therefore, we can always
clear these bits too, which means this will be some kind of memset rather
than read-modify-write.
This is good for some 7% increase or so in gears with huge window size -
seems to have a bigger effect if things aren't in caches. Of course, any
real app won't spend nearly as much time comparatively in clearing
depth buffer in the first place, so the speedup will be much lower.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_setup.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index e8c3e7c7678..34d3c812b60 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -476,27 +476,30 @@ lp_setup_try_clear_zs(struct lp_setup_context *setup,
    uint64_t zsvalue = 0;
    uint32_t zmask32;
    uint8_t smask8;
+   enum pipe_format format = setup->fb.zsbuf->format;
 
    LP_DBG(DEBUG_SETUP, "%s state %d\n", __FUNCTION__, setup->state);
 
    zmask32 = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
    smask8 = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
 
-   zsvalue = util_pack64_z_stencil(setup->fb.zsbuf->format,
-                                   depth,
-                                   stencil);
+   zsvalue = util_pack64_z_stencil(format, depth, stencil);
 
-   /*
-    * XXX: should make a full mask here for things like D24X8,
-    * otherwise we'll do a read-modify-write clear later which
-    * should be unnecessary.
-    */
-   zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format,
-                                       zmask32,
-                                       smask8);
+   zsmask = util_pack64_mask_z_stencil(format, zmask32, smask8);
 
    zsvalue &= zsmask;
 
+   if (format == PIPE_FORMAT_Z24X8_UNORM ||
+       format == PIPE_FORMAT_X8Z24_UNORM) {
+      /*
+       * Make full mask if there's "X" bits so we can do full
+       * clear (without rmw).
+       */
+      uint32_t zsmask_full = 0;
+      zsmask_full = util_pack_mask_z_stencil(format, ~0, ~0);
+      zsmask |= ~zsmask_full;
+   }
+
    if (setup->state == SETUP_ACTIVE) {
       struct lp_scene *scene = setup->scene;
 

From 0a6a05c8eaeda570891fdece2d86e8e0b0e4d56f Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 20 Jan 2016 10:49:54 +1100
Subject: [PATCH 078/119] glsl: add missing explicit_image_format flag to
 has_layout()

Fixes piglit regression after fixes to duplicate layout rules.

Previously catching multiple layouts was relying on the code
meant to catch duplicates within a single layout(...), this
change triggers the rules for multiple layouts.

Cc: Mark Janes <mark.a.janes@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/glsl/ast_type.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index cf494d96724..e0e331152dd 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -74,6 +74,7 @@ ast_type_qualifier::has_layout() const
           || this->flags.q.row_major
           || this->flags.q.packed
           || this->flags.q.explicit_location
+          || this->flags.q.explicit_image_format
           || this->flags.q.explicit_index
           || this->flags.q.explicit_binding
           || this->flags.q.explicit_offset

From 4475d8f9169195baefa893b9b147fe20414cda7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Fri, 15 Jan 2016 13:11:20 +0200
Subject: [PATCH 079/119] glsl: move uniform calculation to link_uniforms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch moves uniform calculation to happen during link_uniforms, this
is possible with help of UniformRemapTable that has all the reserved
locations.

Location assignment for implicit locations is changed so that we
utilize also the 'holes' that explicit uniform location assignment
might have left in UniformRemapTable, this makes it possible to fit
more uniforms as previously we were lazy here and wasting space.

Fixes following CTS tests:
   ES31-CTS.explicit_uniform_location.uniform-loc-mix-with-implicit-max
   ES31-CTS.explicit_uniform_location.uniform-loc-mix-with-implicit-max-array

v2: code cleanups, increment NumUniformRemapTable correctly, fix
    find_empty_block to work properly and add some more comments.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
---
 src/glsl/link_uniforms.cpp | 87 +++++++++++++++++++++++++++++++++-----
 src/glsl/linker.cpp        | 19 +++------
 src/glsl/linker.h          |  3 +-
 3 files changed, 85 insertions(+), 24 deletions(-)

diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 33b2d4c8646..76ee70dd400 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -1057,9 +1057,40 @@ assign_hidden_uniform_slot_id(const char *name, unsigned hidden_id,
    uniform_size->map->put(hidden_uniform_start + hidden_id, name);
 }
 
+/**
+ * Search UniformRemapTable for empty block big enough to hold given uniform.
+ * TODO Optimize this algorithm later if it turns out to be a major bottleneck.
+ */
+static int
+find_empty_block(struct gl_shader_program *prog,
+                 struct gl_uniform_storage *uniform)
+{
+   const unsigned entries = MAX2(1, uniform->array_elements);
+   for (unsigned i = 0, j; i < prog->NumUniformRemapTable; i++) {
+      /* We found empty space in UniformRemapTable. */
+      if (prog->UniformRemapTable[i] == NULL) {
+         for (j = i; j < entries && j < prog->NumUniformRemapTable; j++) {
+            if (prog->UniformRemapTable[j] != NULL) {
+               /* Entries do not fit in this space, continue searching
+                * after this location.
+                */
+               i = j + 1;
+               break;
+            }
+         }
+         /* Entries fit, we can return this location. */
+         if (i != j + 1) {
+            return i;
+         }
+      }
+   }
+   return -1;
+}
+
 void
 link_assign_uniform_locations(struct gl_shader_program *prog,
-                              unsigned int boolean_true)
+                              unsigned int boolean_true,
+                              unsigned int max_locations)
 {
    ralloc_free(prog->UniformStorage);
    prog->UniformStorage = NULL;
@@ -1150,6 +1181,20 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
 
    parcel_out_uniform_storage parcel(prog->UniformHash, uniforms, data);
 
+   unsigned total_entries = 0;
+
+   /* Calculate amount of 'holes' left after explicit locations were
+    * reserved from UniformRemapTable.
+    */
+   unsigned empty_locs = 0;
+   for (unsigned i = 0; i < prog->NumUniformRemapTable; i++)
+      if (prog->UniformRemapTable[i] == NULL)
+         empty_locs++;
+
+   /* Add all the reserved explicit locations - empty locations in remap table. */
+   if (prog->NumUniformRemapTable)
+      total_entries = (prog->NumUniformRemapTable - 1) - empty_locs;
+
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       if (prog->_LinkedShaders[i] == NULL)
 	 continue;
@@ -1213,21 +1258,43 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       /* how many new entries for this uniform? */
       const unsigned entries = MAX2(1, uniforms[i].array_elements);
 
-      /* resize remap table to fit new entries */
-      prog->UniformRemapTable =
-         reralloc(prog,
-                  prog->UniformRemapTable,
-                  gl_uniform_storage *,
-                  prog->NumUniformRemapTable + entries);
+      /* Find UniformRemapTable for empty blocks where we can fit this uniform. */
+      int chosen_location = -1;
+
+      if (empty_locs)
+         chosen_location = find_empty_block(prog, &uniforms[i]);
+
+      if (chosen_location != -1) {
+         empty_locs -= entries;
+      } else {
+         chosen_location = prog->NumUniformRemapTable;
+
+         /* Add new entries to the total amount of entries. */
+         total_entries += entries;
+
+         /* resize remap table to fit new entries */
+         prog->UniformRemapTable =
+            reralloc(prog,
+                     prog->UniformRemapTable,
+                     gl_uniform_storage *,
+                     prog->NumUniformRemapTable + entries);
+         prog->NumUniformRemapTable += entries;
+      }
 
       /* set pointers for this uniform */
       for (unsigned j = 0; j < entries; j++)
-         prog->UniformRemapTable[prog->NumUniformRemapTable+j] = &uniforms[i];
+         prog->UniformRemapTable[chosen_location + j] = &uniforms[i];
 
       /* set the base location in remap table for the uniform */
-      uniforms[i].remap_location = prog->NumUniformRemapTable;
+      uniforms[i].remap_location = chosen_location;
+   }
 
-      prog->NumUniformRemapTable += entries;
+    /* Verify that total amount of entries for explicit and implicit locations
+     * is less than MAX_UNIFORM_LOCATIONS.
+     */
+   if (total_entries >= max_locations) {
+      linker_error(prog, "count of uniform locations >= MAX_UNIFORM_LOCATIONS"
+                   "(%u >= %u)", total_entries, max_locations);
    }
 
    /* Reserve all the explicit locations of the active subroutine uniforms. */
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 6657777d74c..5be8d9fc48f 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3146,7 +3146,6 @@ check_explicit_uniform_locations(struct gl_context *ctx,
       return;
    }
 
-   unsigned entries_total = 0;
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       struct gl_shader *sh = prog->_LinkedShaders[i];
 
@@ -3158,8 +3157,6 @@ check_explicit_uniform_locations(struct gl_context *ctx,
          if (!var || var->data.mode != ir_var_uniform)
             continue;
 
-         entries_total += var->type->uniform_locations();
-
          if (var->data.explicit_location) {
             bool ret;
             if (var->type->without_array()->is_subroutine())
@@ -3173,15 +3170,6 @@ check_explicit_uniform_locations(struct gl_context *ctx,
          }
       }
    }
-
-   /* Verify that total amount of entries for explicit and implicit locations
-    * is less than MAX_UNIFORM_LOCATIONS.
-    */
-   if (entries_total >= ctx->Const.MaxUserAssignableUniformLocations) {
-      linker_error(prog, "count of uniform locations >= MAX_UNIFORM_LOCATIONS"
-                   "(%u >= %u)", entries_total,
-                   ctx->Const.MaxUserAssignableUniformLocations);
-   }
    delete uniform_map;
 }
 
@@ -4556,7 +4544,12 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       goto done;
 
    update_array_sizes(prog);
-   link_assign_uniform_locations(prog, ctx->Const.UniformBooleanTrue);
+   link_assign_uniform_locations(prog, ctx->Const.UniformBooleanTrue,
+                                 ctx->Const.MaxUserAssignableUniformLocations);
+
+   if (!prog->LinkStatus)
+      goto done;
+
    link_assign_atomic_counter_resources(ctx, prog);
    store_fragdepth_layout(prog);
 
diff --git a/src/glsl/linker.h b/src/glsl/linker.h
index c80be1c7e22..76f95c04704 100644
--- a/src/glsl/linker.h
+++ b/src/glsl/linker.h
@@ -35,7 +35,8 @@ link_invalidate_variable_locations(exec_list *ir);
 
 extern void
 link_assign_uniform_locations(struct gl_shader_program *prog,
-                              unsigned int boolean_true);
+                              unsigned int boolean_true,
+                              unsigned int max_locations);
 
 extern void
 link_set_uniform_initializers(struct gl_shader_program *prog,

From f1152c3455d0fa1a8b3cafee2aaeffb31d5ba49b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Wed, 20 Jan 2016 22:02:22 +0200
Subject: [PATCH 080/119] Revert "glsl: move uniform calculation to
 link_uniforms"

This reverts commit 4475d8f9169195baefa893b9b147fe20414cda7c.
---
 src/glsl/link_uniforms.cpp | 87 +++++---------------------------------
 src/glsl/linker.cpp        | 19 ++++++---
 src/glsl/linker.h          |  3 +-
 3 files changed, 24 insertions(+), 85 deletions(-)

diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 76ee70dd400..33b2d4c8646 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -1057,40 +1057,9 @@ assign_hidden_uniform_slot_id(const char *name, unsigned hidden_id,
    uniform_size->map->put(hidden_uniform_start + hidden_id, name);
 }
 
-/**
- * Search UniformRemapTable for empty block big enough to hold given uniform.
- * TODO Optimize this algorithm later if it turns out to be a major bottleneck.
- */
-static int
-find_empty_block(struct gl_shader_program *prog,
-                 struct gl_uniform_storage *uniform)
-{
-   const unsigned entries = MAX2(1, uniform->array_elements);
-   for (unsigned i = 0, j; i < prog->NumUniformRemapTable; i++) {
-      /* We found empty space in UniformRemapTable. */
-      if (prog->UniformRemapTable[i] == NULL) {
-         for (j = i; j < entries && j < prog->NumUniformRemapTable; j++) {
-            if (prog->UniformRemapTable[j] != NULL) {
-               /* Entries do not fit in this space, continue searching
-                * after this location.
-                */
-               i = j + 1;
-               break;
-            }
-         }
-         /* Entries fit, we can return this location. */
-         if (i != j + 1) {
-            return i;
-         }
-      }
-   }
-   return -1;
-}
-
 void
 link_assign_uniform_locations(struct gl_shader_program *prog,
-                              unsigned int boolean_true,
-                              unsigned int max_locations)
+                              unsigned int boolean_true)
 {
    ralloc_free(prog->UniformStorage);
    prog->UniformStorage = NULL;
@@ -1181,20 +1150,6 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
 
    parcel_out_uniform_storage parcel(prog->UniformHash, uniforms, data);
 
-   unsigned total_entries = 0;
-
-   /* Calculate amount of 'holes' left after explicit locations were
-    * reserved from UniformRemapTable.
-    */
-   unsigned empty_locs = 0;
-   for (unsigned i = 0; i < prog->NumUniformRemapTable; i++)
-      if (prog->UniformRemapTable[i] == NULL)
-         empty_locs++;
-
-   /* Add all the reserved explicit locations - empty locations in remap table. */
-   if (prog->NumUniformRemapTable)
-      total_entries = (prog->NumUniformRemapTable - 1) - empty_locs;
-
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       if (prog->_LinkedShaders[i] == NULL)
 	 continue;
@@ -1258,43 +1213,21 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       /* how many new entries for this uniform? */
       const unsigned entries = MAX2(1, uniforms[i].array_elements);
 
-      /* Find UniformRemapTable for empty blocks where we can fit this uniform. */
-      int chosen_location = -1;
-
-      if (empty_locs)
-         chosen_location = find_empty_block(prog, &uniforms[i]);
-
-      if (chosen_location != -1) {
-         empty_locs -= entries;
-      } else {
-         chosen_location = prog->NumUniformRemapTable;
-
-         /* Add new entries to the total amount of entries. */
-         total_entries += entries;
-
-         /* resize remap table to fit new entries */
-         prog->UniformRemapTable =
-            reralloc(prog,
-                     prog->UniformRemapTable,
-                     gl_uniform_storage *,
-                     prog->NumUniformRemapTable + entries);
-         prog->NumUniformRemapTable += entries;
-      }
+      /* resize remap table to fit new entries */
+      prog->UniformRemapTable =
+         reralloc(prog,
+                  prog->UniformRemapTable,
+                  gl_uniform_storage *,
+                  prog->NumUniformRemapTable + entries);
 
       /* set pointers for this uniform */
       for (unsigned j = 0; j < entries; j++)
-         prog->UniformRemapTable[chosen_location + j] = &uniforms[i];
+         prog->UniformRemapTable[prog->NumUniformRemapTable+j] = &uniforms[i];
 
       /* set the base location in remap table for the uniform */
-      uniforms[i].remap_location = chosen_location;
-   }
+      uniforms[i].remap_location = prog->NumUniformRemapTable;
 
-    /* Verify that total amount of entries for explicit and implicit locations
-     * is less than MAX_UNIFORM_LOCATIONS.
-     */
-   if (total_entries >= max_locations) {
-      linker_error(prog, "count of uniform locations >= MAX_UNIFORM_LOCATIONS"
-                   "(%u >= %u)", total_entries, max_locations);
+      prog->NumUniformRemapTable += entries;
    }
 
    /* Reserve all the explicit locations of the active subroutine uniforms. */
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 5be8d9fc48f..6657777d74c 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3146,6 +3146,7 @@ check_explicit_uniform_locations(struct gl_context *ctx,
       return;
    }
 
+   unsigned entries_total = 0;
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       struct gl_shader *sh = prog->_LinkedShaders[i];
 
@@ -3157,6 +3158,8 @@ check_explicit_uniform_locations(struct gl_context *ctx,
          if (!var || var->data.mode != ir_var_uniform)
             continue;
 
+         entries_total += var->type->uniform_locations();
+
          if (var->data.explicit_location) {
             bool ret;
             if (var->type->without_array()->is_subroutine())
@@ -3170,6 +3173,15 @@ check_explicit_uniform_locations(struct gl_context *ctx,
          }
       }
    }
+
+   /* Verify that total amount of entries for explicit and implicit locations
+    * is less than MAX_UNIFORM_LOCATIONS.
+    */
+   if (entries_total >= ctx->Const.MaxUserAssignableUniformLocations) {
+      linker_error(prog, "count of uniform locations >= MAX_UNIFORM_LOCATIONS"
+                   "(%u >= %u)", entries_total,
+                   ctx->Const.MaxUserAssignableUniformLocations);
+   }
    delete uniform_map;
 }
 
@@ -4544,12 +4556,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       goto done;
 
    update_array_sizes(prog);
-   link_assign_uniform_locations(prog, ctx->Const.UniformBooleanTrue,
-                                 ctx->Const.MaxUserAssignableUniformLocations);
-
-   if (!prog->LinkStatus)
-      goto done;
-
+   link_assign_uniform_locations(prog, ctx->Const.UniformBooleanTrue);
    link_assign_atomic_counter_resources(ctx, prog);
    store_fragdepth_layout(prog);
 
diff --git a/src/glsl/linker.h b/src/glsl/linker.h
index 76f95c04704..c80be1c7e22 100644
--- a/src/glsl/linker.h
+++ b/src/glsl/linker.h
@@ -35,8 +35,7 @@ link_invalidate_variable_locations(exec_list *ir);
 
 extern void
 link_assign_uniform_locations(struct gl_shader_program *prog,
-                              unsigned int boolean_true,
-                              unsigned int max_locations);
+                              unsigned int boolean_true);
 
 extern void
 link_set_uniform_initializers(struct gl_shader_program *prog,

From 9f23007a7a56d576c8f20b76583ca2416e94a75d Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 20 Jan 2016 17:12:59 -0500
Subject: [PATCH 081/119] gk110/ir: add partial BAR support

This is enough for the plain TGSI BARRIER implementation.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_emit_gk110.cpp    | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index b1064bf0a92..d79f3060beb 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1252,8 +1252,24 @@ CodeEmitterGK110::emitPIXLD(const Instruction *i)
 void
 CodeEmitterGK110::emitBAR(const Instruction *i)
 {
-   /* TODO */
-   emitNOP(i);
+   code[0] = 0x00000002;
+   code[1] = 0x85400000;
+
+   switch (i->subOp) {
+   case NV50_IR_SUBOP_BAR_ARRIVE:   code[1] |= 0x08; break;
+   case NV50_IR_SUBOP_BAR_RED_AND:  code[1] |= 0x50; break;
+   case NV50_IR_SUBOP_BAR_RED_OR:   code[1] |= 0x90; break;
+   case NV50_IR_SUBOP_BAR_RED_POPC: code[1] |= 0x10; break;
+   default:
+      code[1] |= 0x20;
+      assert(i->subOp == NV50_IR_SUBOP_BAR_SYNC);
+      break;
+   }
+
+   emitPredicate(i);
+
+   srcId(i->src(0), 10);
+   srcId(i->src(1), 23);
 }
 
 void

From 3a635761685e839447fec2e5f84ce8bb8682dbc7 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 20 Jan 2016 17:15:27 -0500
Subject: [PATCH 082/119] gk110/ir: fix load from shared memory

It was accidentally using the store opcode.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index d79f3060beb..4248d6cb248 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1613,7 +1613,7 @@ CodeEmitterGK110::emitLOAD(const Instruction *i)
    switch (i->src(0).getFile()) {
    case FILE_MEMORY_GLOBAL: code[1] = 0xc0000000; code[0] = 0x00000000; break;
    case FILE_MEMORY_LOCAL:  code[1] = 0x7a000000; code[0] = 0x00000002; break;
-   case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break;
+   case FILE_MEMORY_SHARED: code[1] = 0x7a400000; code[0] = 0x00000002; break;
    case FILE_MEMORY_CONST:
       if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) {
          emitMOV(i);

From dc3ac418bf889620c93f50c68ef55b9e9de3afd3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 20 Jan 2016 17:59:34 -0500
Subject: [PATCH 083/119] nv50/ir: don't flip SHL(ADD) into ADD(SHL) if ADD
 sources have modifiers

Fixes: 31fde8fa (nv50/ir: flip shl(add, imm) into add(shl, imm))
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 75773553840..95e9fdfc57d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1229,6 +1229,8 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
             adds = 1;
          else
             return;
+         if (si->src(!adds).mod != Modifier(0))
+            return;
          // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
 
          // This is more operations, but if one of x, y is an immediate, then

From e925ec881128651b97ba747f644e921597a1c209 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 21 Jan 2016 00:04:56 +0100
Subject: [PATCH 084/119] llvmpipe,i915: add back NEW_RASTERIZER dependency
 when computing vertex info

I removed this mistakenly in 2dbc20e45689e09766552517a74e2270e49817b5. I
actually thought it should not be necessary and a piglit run didn't show
any differences, but this shouldn't have been in there.
draw_prepare_shader_outputs() is in fact dependent on NEW_RASTERIZER.
The new polygon-mode-facing test indeed shows why this is necessary, there's
lots of invalid reads and writes with valgrind (also crashes without
valgrind), because the pre-pipeline vertex size doesn't match the
post-pipeline vertex size (note this won't help much with stages which don't
have the prepare hook which can grow the vertex size, in particular the wide
point stage, but this isn't used by llvmpipe). The test still won't pass, of
course, but it is only usage of uninitialized values now, which is much
less dangerous...
(Albeit I'm pretty sure for i915 it really is not needed anymore as it
doesn't care about the extra outputs and doesn't call
draw_prepare_shader_outputs().)

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/draw/draw_llvm.c          | 6 ++++++
 src/gallium/drivers/i915/i915_state_derived.c   | 2 +-
 src/gallium/drivers/llvmpipe/lp_state_derived.c | 6 ++++--
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 142d78ae49d..b48bdcc779e 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1618,6 +1618,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
    context_ptr               = LLVMGetParam(variant_func, 0);
    io_ptr                    = LLVMGetParam(variant_func, 1);
    vbuffers_ptr              = LLVMGetParam(variant_func, 2);
+   /*
+    * XXX: stride is actually unused. The stride we use is strictly calculated
+    * from the number of outputs (including the draw_extra outputs).
+    * Should probably fix some day (we need a new vs just because of extra
+    * outputs which the generated vs won't touch).
+    */
    stride                    = LLVMGetParam(variant_func, 5 + (elts ? 1 : 0));
    vb_ptr                    = LLVMGetParam(variant_func, 6 + (elts ? 1 : 0));
    system_values.instance_id = LLVMGetParam(variant_func, 7 + (elts ? 1 : 0));
diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
index bd0f448f645..177b8545985 100644
--- a/src/gallium/drivers/i915/i915_state_derived.c
+++ b/src/gallium/drivers/i915/i915_state_derived.c
@@ -184,7 +184,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
 struct i915_tracked_state i915_update_vertex_layout = {
    "vertex_layout",
    calculate_vertex_layout,
-   I915_NEW_FS | I915_NEW_VS
+   I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 34961cbbac5..c90f2f270fe 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -190,8 +190,10 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
       llvmpipe->tex_timestamp = lp_screen->timestamp;
       llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
    }
-      
-   if (llvmpipe->dirty & (LP_NEW_FS |
+
+   /* This needs LP_NEW_RASTERIZER because of draw_prepare_shader_outputs(). */
+   if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
+                          LP_NEW_FS |
                           LP_NEW_VS))
       compute_vertex_info(llvmpipe);
 

From dc8b9bd0aae1b1bae89edda517ffe5c83b759552 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Wed, 20 Jan 2016 00:48:07 +0100
Subject: [PATCH 085/119] llvmpipe: warn about illegal use of objects in
 different contexts

Doing that is clearly a bug. We can't quite assert as st/mesa may hit this,
but increase at least visibility of it a bit.
(For the non-refcounted objects it would be illegal too, but we can't detect
that unless we'd store the context ourselves. Plus, those don't tend to cause
random crashes at context or object destruction time... So just sampler views,
surfaces and so targets for now.)

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_state_sampler.c |  9 +++++++++
 src/gallium/drivers/llvmpipe/lp_state_so.c      |  9 +++++++++
 src/gallium/drivers/llvmpipe/lp_state_surface.c | 15 ++++++++++++++-
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 69af38e92c0..32bf9fdd25d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -129,6 +129,15 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe,
        */
       pipe_sampler_view_release(pipe,
                                 &llvmpipe->sampler_views[shader][start + i]);
+      /*
+       * Warn if someone tries to set a view created in a different context
+       * (which is why we need the hack above in the first place).
+       * An assert would be better but st/mesa relies on it...
+       */
+      if (views[i] && views[i]->context != pipe) {
+         debug_printf("Illegal setting of sampler_view %d created in another "
+                      "context\n", i);
+      }
       pipe_sampler_view_reference(&llvmpipe->sampler_views[shader][start + i],
                                   views[i]);
    }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_so.c b/src/gallium/drivers/llvmpipe/lp_state_so.c
index 2af04cdf1c3..b2afd6fbf70 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_so.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_so.c
@@ -70,6 +70,15 @@ llvmpipe_set_so_targets(struct pipe_context *pipe,
    int i;
    for (i = 0; i < num_targets; i++) {
       const boolean append = (offsets[i] == (unsigned)-1);
+      /*
+       * Warn if the so target was created in another context.
+       * XXX Not entirely sure if mesa/st may rely on this?
+       * Otherwise should just assert.
+       */
+      if (targets[i] && targets[i]->context != pipe) {
+         debug_printf("Illegal setting of so target with target %d created in "
+                       "another context\n", i);
+      }
       pipe_so_target_reference((struct pipe_stream_output_target **)&llvmpipe->so_targets[i], targets[i]);
       /* If we're not appending then lets set the internal
          offset to what was requested */
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
index c879ba9751d..b20b9c5cdd5 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -52,6 +52,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
    struct llvmpipe_context *lp = llvmpipe_context(pipe);
 
    boolean changed = !util_framebuffer_state_equal(&lp->framebuffer, fb);
+   unsigned i;
 
    assert(fb->width <= LP_MAX_WIDTH);
    assert(fb->height <= LP_MAX_HEIGHT);
@@ -66,10 +67,22 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
       const struct util_format_description *depth_desc =
          util_format_description(depth_format);
 
+      if (lp->framebuffer.zsbuf && lp->framebuffer.zsbuf->context != pipe) {
+         debug_printf("Illegal setting of fb state with zsbuf created in "
+                       "another context\n");
+      }
+      for (i = 0; i < fb->nr_cbufs; i++) {
+         if (lp->framebuffer.cbufs[i] &&
+             lp->framebuffer.cbufs[i]->context != pipe) {
+            debug_printf("Illegal setting of fb state with cbuf %d created in "
+                          "another context\n", i);
+         }
+      }
+
       util_copy_framebuffer_state(&lp->framebuffer, fb);
 
       if (LP_PERF & PERF_NO_DEPTH) {
-	 pipe_surface_reference(&lp->framebuffer.zsbuf, NULL);
+         pipe_surface_reference(&lp->framebuffer.zsbuf, NULL);
       }
 
       /*

From 7d9a97d6beee8878ac57448af8c41b8af3f189b1 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 21 Sep 2015 20:00:36 -0400
Subject: [PATCH 086/119] gk110/ir: add atomic op emission, fix gmem loads

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_emit_gk110.cpp    | 70 +++++++++++++++++--
 1 file changed, 65 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 4248d6cb248..8a98c29035f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -75,7 +75,7 @@ private:
    void emitLOAD(const Instruction *);
    void emitSTORE(const Instruction *);
    void emitMOV(const Instruction *);
-   void emitMEMBAR(const Instruction *);
+   void emitATOM(const Instruction *);
 
    void emitINTERP(const Instruction *);
    void emitAFETCH(const Instruction *);
@@ -123,6 +123,7 @@ private:
    void emitPIXLD(const Instruction *);
 
    void emitBAR(const Instruction *);
+   void emitMEMBAR(const Instruction *);
 
    void emitFlow(const Instruction *);
 
@@ -1272,6 +1273,14 @@ CodeEmitterGK110::emitBAR(const Instruction *i)
    srcId(i->src(1), 23);
 }
 
+void CodeEmitterGK110::emitMEMBAR(const Instruction *i)
+{
+   code[0] = 0x00000002 | NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) << 8;
+   code[1] = 0x7cc00000;
+
+   emitPredicate(i);
+}
+
 void
 CodeEmitterGK110::emitFlow(const Instruction *i)
 {
@@ -1644,7 +1653,13 @@ CodeEmitterGK110::emitLOAD(const Instruction *i)
    emitPredicate(i);
 
    defId(i->def(0), 2);
-   srcId(i->src(0).getIndirect(0), 10);
+   if (i->getIndirect(0, 0)) {
+      srcId(i->src(0).getIndirect(0), 10);
+      if (i->getIndirect(0, 0)->reg.size == 8)
+         code[1] |= 1 << 23;
+   } else {
+      code[0] |= 255 << 10;
+   }
 }
 
 uint8_t
@@ -1699,12 +1714,54 @@ CodeEmitterGK110::emitMOV(const Instruction *i)
    }
 }
 
-void CodeEmitterGK110::emitMEMBAR(const Instruction *i)
+void
+CodeEmitterGK110::emitATOM(const Instruction *i)
 {
-   code[0] = 0x00000002 | NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) << 8;
-   code[1] = 0x7cc00000;
+   code[0] = 0x00000002;
+   if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
+      code[1] = 0x77800000;
+   else
+      code[1] = 0x68000000;
+
+   switch (i->subOp) {
+   case NV50_IR_SUBOP_ATOM_CAS: break;
+   case NV50_IR_SUBOP_ATOM_EXCH: code[1] |= 0x04000000; break;
+   default: code[1] |= i->subOp << 23; break;
+   }
+
+   switch (i->dType) {
+   case TYPE_U32: break;
+   case TYPE_S32: code[1] |= 0x00100000; break;
+   case TYPE_U64: code[1] |= 0x00200000; break;
+   case TYPE_F32: code[1] |= 0x00300000; break;
+   case TYPE_B128: code[1] |= 0x00400000; break; /* TODO: U128 */
+   case TYPE_S64: code[1] |= 0x00500000; break;
+   default: assert(!"unsupported type"); break;
+   }
 
    emitPredicate(i);
+
+   /* TODO: cas: check that src regs line up */
+   /* TODO: cas: flip bits if $r255 is used */
+   srcId(i->src(1), 23);
+
+   if (i->defExists(0))
+      defId(i->def(0), 2);
+   else
+      code[0] |= 255 << 2;
+
+   const int32_t offset = SDATA(i->src(0)).offset;
+   assert(offset < 0x80000 && offset >= -0x80000);
+   code[0] |= (offset & 1) << 31;
+   code[1] |= (offset & 0xffffe) >> 1;
+
+   if (i->getIndirect(0, 0)) {
+      srcId(i->getIndirect(0, 0), 10);
+      if (i->getIndirect(0, 0)->reg.size == 8)
+         code[1] |= 1 << 19;
+   } else {
+      code[0] |= 255 << 10;
+   }
 }
 
 bool
@@ -1941,6 +1998,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
    case OP_MEMBAR:
       emitMEMBAR(insn);
       break;
+   case OP_ATOM:
+      emitATOM(insn);
+      break;
    case OP_PHI:
    case OP_UNION:
    case OP_CONSTRAINT:

From 8c2dfe05c5db9946d2d30546920e98bcfb8e3eb9 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 22 Sep 2015 19:13:14 -0400
Subject: [PATCH 087/119] gk110/ir: add OP_CCTL handling

---
 .../nouveau/codegen/nv50_ir_emit_gk110.cpp    | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 8a98c29035f..59041dfacab 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -76,6 +76,7 @@ private:
    void emitSTORE(const Instruction *);
    void emitMOV(const Instruction *);
    void emitATOM(const Instruction *);
+   void emitCCTL(const Instruction *);
 
    void emitINTERP(const Instruction *);
    void emitAFETCH(const Instruction *);
@@ -1714,6 +1715,14 @@ CodeEmitterGK110::emitMOV(const Instruction *i)
    }
 }
 
+static inline bool
+uses64bitAddress(const Instruction *ldst)
+{
+   return ldst->src(0).getFile() == FILE_MEMORY_GLOBAL &&
+      ldst->src(0).isIndirect(0) &&
+      ldst->getIndirect(0, 0)->reg.size == 8;
+}
+
 void
 CodeEmitterGK110::emitATOM(const Instruction *i)
 {
@@ -1764,6 +1773,29 @@ CodeEmitterGK110::emitATOM(const Instruction *i)
    }
 }
 
+void
+CodeEmitterGK110::emitCCTL(const Instruction *i)
+{
+   int32_t offset = SDATA(i->src(0)).offset;
+
+   code[0] = 0x00000002 | (i->subOp << 2);
+
+   if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+      code[1] = 0x7b000000;
+   } else {
+      code[1] = 0x7c000000;
+      offset &= 0xffffff;
+   }
+   code[0] |= offset << 23;
+   code[1] |= offset >> 9;
+
+   if (uses64bitAddress(i))
+      code[1] |= 1 << 23;
+   srcId(i->src(0).getIndirect(0), 10);
+
+   emitPredicate(i);
+}
+
 bool
 CodeEmitterGK110::emitInstruction(Instruction *insn)
 {
@@ -2001,6 +2033,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
    case OP_ATOM:
       emitATOM(insn);
       break;
+   case OP_CCTL:
+      emitCCTL(insn);
+      break;
    case OP_PHI:
    case OP_UNION:
    case OP_CONSTRAINT:

From 2e533ab74be1f997ddfaaf01798e7e3018138ac2 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 28 Dec 2015 15:59:03 -0500
Subject: [PATCH 088/119] gk110/ir: fix double-wide vm address

---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 59041dfacab..adb61d55501 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1613,6 +1613,10 @@ CodeEmitterGK110::emitSTORE(const Instruction *i)
 
    srcId(i->src(1), 2);
    srcId(i->src(0).getIndirect(0), 10);
+   if (i->src(0).getFile() == FILE_MEMORY_GLOBAL &&
+       i->src(0).isIndirect(0) &&
+       i->getIndirect(0, 0)->reg.size == 8)
+      code[1] |= 1 << 23;
 }
 
 void

From 57b00258146632ba5ef4b84513ec4faa8a5907c1 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 7 Nov 2015 03:30:26 -0500
Subject: [PATCH 089/119] gm107/ir: set LD/ST address width bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index ec74e7ac811..c628cfe9a54 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -2146,6 +2146,7 @@ CodeEmitterGM107::emitLD()
    emitPRED (0x3a);
    emitLDSTc(0x38);
    emitLDSTs(0x35, insn->dType);
+   emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8);
    emitADDR (0x08, 0x14, 32, 0, insn->src(0));
    emitGPR  (0x00, insn->def(0));
 }
@@ -2176,6 +2177,7 @@ CodeEmitterGM107::emitST()
    emitPRED (0x3a);
    emitLDSTc(0x38);
    emitLDSTs(0x35, insn->dType);
+   emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8);
    emitADDR (0x08, 0x14, 32, 0, insn->src(0));
    emitGPR  (0x00, insn->src(1));
 }

From 71a489633b1bdd8492e22fe15bb9a57e30fa4ccd Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 3 Jan 2016 02:11:51 -0500
Subject: [PATCH 090/119] gm107/ir: add ATOM and CCTL support

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_emit_gm107.cpp    | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index c628cfe9a54..465319af5ce 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -176,6 +176,8 @@ private:
    void emitISBERD();
    void emitAL2P();
    void emitIPA();
+   void emitATOM();
+   void emitCCTL();
 
    void emitPIXLD();
 
@@ -2298,6 +2300,50 @@ CodeEmitterGM107::emitIPA()
       emitGPR(0x27);
 }
 
+void
+CodeEmitterGM107::emitATOM()
+{
+   unsigned dType, subOp;
+   switch (insn->dType) {
+   case TYPE_U32: dType = 0; break;
+   case TYPE_S32: dType = 1; break;
+   case TYPE_U64: dType = 2; break;
+   case TYPE_F32: dType = 3; break;
+   case TYPE_B128: dType = 4; break;
+   case TYPE_S64: dType = 5; break;
+   default: assert(!"unexpected dType"); dType = 0; break;
+   }
+   if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH)
+      subOp = 8;
+   else
+      subOp = insn->subOp;
+   assert(insn->subOp != NV50_IR_SUBOP_ATOM_CAS); /* XXX */
+
+   emitInsn (0xed000000);
+   emitField(0x34, 4, subOp);
+   emitField(0x31, 3, dType);
+   emitField(0x30, 1, insn->src(0).getIndirect(0)->getSize() == 8);
+   emitGPR  (0x14, insn->src(1));
+   emitADDR (0x08, 0x1c, 20, 0, insn->src(0));
+   emitGPR  (0x00, insn->def(0));
+}
+
+void
+CodeEmitterGM107::emitCCTL()
+{
+   unsigned width;
+   if (insn->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+      emitInsn(0xef600000);
+      width = 30;
+   } else {
+      emitInsn(0xef800000);
+      width = 22;
+   }
+   emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8);
+   emitADDR (0x08, 0x16, width, 2, insn->src(0));
+   emitField(0x00, 4, insn->subOp);
+}
+
 /*******************************************************************************
  * surface
  ******************************************************************************/
@@ -2797,6 +2843,12 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
          break;
       }
       break;
+   case OP_ATOM:
+      emitATOM();
+      break;
+   case OP_CCTL:
+      emitCCTL();
+      break;
    case OP_VFETCH:
       emitALD();
       break;

From 73c9ca754487fa5f39122119e1588e13ffcf5f47 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 18 Jan 2016 23:55:19 -0500
Subject: [PATCH 091/119] gm107/ir: add carry emission to LOP and IADD

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 465319af5ce..1fa0eb6da6d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -1554,11 +1554,13 @@ CodeEmitterGM107::emitLOP()
          break;
       }
       emitPRED (0x30);
+      emitX    (0x2b);
       emitField(0x29, 2, lop);
       emitINV  (0x28, insn->src(1));
       emitINV  (0x27, insn->src(0));
    } else {
       emitInsn (0x04000000);
+      emitX    (0x39);
       emitINV  (0x38, insn->src(1));
       emitINV  (0x37, insn->src(0));
       emitField(0x35, 2, lop);
@@ -1626,9 +1628,11 @@ CodeEmitterGM107::emitIADD()
       emitNEG(0x31, insn->src(0));
       emitNEG(0x30, insn->src(1));
       emitCC (0x2f);
+      emitX  (0x2b);
    } else {
       emitInsn(0x1c000000);
       emitSAT (0x36);
+      emitX   (0x35);
       emitCC  (0x34);
       emitIMMD(0x14, 32, insn->src(1));
    }

From c0b66d96d77b646282f7e732c4c25761431336c3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 19 Jan 2016 05:30:56 -0500
Subject: [PATCH 092/119] gk110/ir: allow carry to be set/read by imad

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index adb61d55501..17cb484d2ba 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -700,6 +700,10 @@ CodeEmitterGK110::emitIMAD(const Instruction *i)
 
    if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
       code[1] |= 1 << 25;
+
+   if (i->flagsDef >= 0) code[1] |= 1 << 18;
+   if (i->flagsSrc >= 0) code[1] |= 1 << 20;
+
    SAT_(35);
 }
 

From daa0fd7843dfc3f2f209a1b755df01f4dbc8122f Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 19 Jan 2016 05:37:24 -0500
Subject: [PATCH 093/119] nv50/ir: 64-bit splitting fixes

Take reading shader outputs into account, and use setFlagsDef for the
carry since we rely on having i->flagsDef being set.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index 1bf7240e131..f58cf97646e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -615,6 +615,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
          case FILE_MEMORY_CONST:
          case FILE_MEMORY_SHARED:
          case FILE_SHADER_INPUT:
+         case FILE_SHADER_OUTPUT:
             hi->getSrc(s)->reg.data.offset += 4;
             break;
          default:
@@ -625,7 +626,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
       }
    }
    if (srcNr == 2) {
-      lo->setDef(1, carry);
+      lo->setFlagsDef(1, carry);
       hi->setFlagsSrc(hi->srcCount(), carry);
    }
    return hi;

From daa775b58e6dcc800dd611678b32cb1c5184f5b0 Mon Sep 17 00:00:00 2001
From: Arlie Davis <arlied@google.com>
Date: Thu, 17 Sep 2015 15:19:24 -0700
Subject: [PATCH 094/119] mesa: Reduce libGL.so binary size by about 15%

This patch significantly reduces the size of the libGL.so binary. It does
not change the (externally visible) behavior of libGL.so at all.

gl_gentable.py generates a function, _glapi_create_table_from_handle.
This function allocates a large dispatch table, consisting of 1300 or so
function pointers, and fills this dispatch table by doing symbol lookups
on a given shared library.  Previously, gl_gentable.py would generate a
single, very large _glapi_create_table_from_handle function, with a short
cluster of lines for each entry point (function).  The idiom it generates
was a NULL check, a call to snprintf, a call to dlsym / GetProcAddress,
and then a store into the dispatch table.  Since this function processes
a large number of entry points, this code is duplicated many times over.

We can encode the same information much more compactly, by using a lookup
table.  The previous total size of _glapi_create_table_from_handle on x64
was 125848 bytes.  By using a lookup table, the size of
_glapi_create_table_from_handle (and the related lookup tables) is reduced
to 10840 bytes.  In other words, this enormous function is reduced by 91%.
The size of the entire libGL.so binary (measured when stripped) itself drops
by 15%.

So the purpose of this change is to reduce the binary size, which frees up
disk space, memory, etc.

size lib/libGL.so.1.2.0 on my system shows (Andreas)
   text	   data	    bss	    dec	    hex	filename
 565947	  11256	   2720	 579923	  8d953	lib/libGL.so.1.2.0 before
 469211	  21848	   2720	 493779	  788d3	lib/libGL.so.1.2.0 after

v2: Incorporate Matt's feedback.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
Tested-by: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
Signed-off-by: Andreas Boll <andreas.boll.dev@gmail.com>
---
 src/mapi/glapi/gen/gl_gentable.py | 57 ++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/src/mapi/glapi/gen/gl_gentable.py b/src/mapi/glapi/gen/gl_gentable.py
index 1b3eb72470d..7cd475aa2b8 100644
--- a/src/mapi/glapi/gen/gl_gentable.py
+++ b/src/mapi/glapi/gen/gl_gentable.py
@@ -113,6 +113,9 @@ __glapi_gentable_set_remaining_noop(struct _glapi_table *disp) {
             dispatch[i] = p.v;
 }
 
+"""
+
+footer = """
 struct _glapi_table *
 _glapi_create_table_from_handle(void *handle, const char *symbol_prefix) {
     struct _glapi_table *disp = calloc(_glapi_get_dispatch_table_size(), sizeof(_glapi_proc));
@@ -123,27 +126,28 @@ _glapi_create_table_from_handle(void *handle, const char *symbol_prefix) {
 
     if(symbol_prefix == NULL)
         symbol_prefix = "";
-"""
 
-footer = """
-    __glapi_gentable_set_remaining_noop(disp);
+    /* Note: This code relies on _glapi_table_func_names being sorted by the
+     * entry point index of each function.
+     */
+    for (int func_index = 0; func_index < GLAPI_TABLE_COUNT; ++func_index) {
+        const char *name = _glapi_table_func_names[func_index];
+        void ** procp = &((void **)disp)[func_index];
 
-    return disp;
-}
-"""
-
-body_template = """
-    if(!disp->%(name)s) {
-        void ** procp = (void **) &disp->%(name)s;
-        snprintf(symboln, sizeof(symboln), "%%s%(entry_point)s", symbol_prefix);
+        snprintf(symboln, sizeof(symboln), \"%s%s\", symbol_prefix, name);
 #ifdef _WIN32
         *procp = GetProcAddress(handle, symboln);
 #else
         *procp = dlsym(handle, symboln);
 #endif
     }
+    __glapi_gentable_set_remaining_noop(disp);
+
+    return disp;
+}
 """
 
+
 class PrintCode(gl_XML.gl_print_base):
 
     def __init__(self):
@@ -180,12 +184,33 @@ class PrintCode(gl_XML.gl_print_base):
 
 
     def printBody(self, api):
-        for f in api.functionIterateByOffset():
-            for entry_point in f.entry_points:
-                vars = { 'entry_point' : entry_point,
-                         'name' : f.name }
 
-                print body_template % vars
+        # Determine how many functions have a defined offset.
+        func_count = 0
+        for f in api.functions_by_name.itervalues():
+            if f.offset != -1:
+                func_count += 1
+
+        # Build the mapping from offset to function name.
+        funcnames = [None] * func_count
+        for f in api.functions_by_name.itervalues():
+            if f.offset != -1:
+                if not (funcnames[f.offset] is None):
+                    raise Exception("Function table has more than one function with same offset (offset %d, func %s)" % (f.offset, f.name))
+                funcnames[f.offset] = f.name
+
+        # Check that the table has no gaps.  We expect a function at every offset,
+        # and the code which generates the table relies on this.
+        for i in xrange(0, func_count):
+            if funcnames[i] is None:
+                raise Exception("Function table has no function at offset %d" % (i))
+
+        print "#define GLAPI_TABLE_COUNT %d" % func_count
+        print "static const char * const _glapi_table_func_names[GLAPI_TABLE_COUNT] = {"
+        for i in xrange(0, func_count):
+            print "    /* %5d */ \"%s\"," % (i, funcnames[i])
+        print "};"
+
         return
 
 

From 5d4b20267dce67cf7a49ce70604e662cfe089096 Mon Sep 17 00:00:00 2001
From: Andreas Boll <andreas.boll.dev@gmail.com>
Date: Wed, 9 Dec 2015 13:41:22 +0100
Subject: [PATCH 095/119] glapi: Build glapi_gentable.c only on Darwin

Removes the public symbol _glapi_create_table_from_handle from
libGL.so.1.2.0 on all platforms except Darwin.

Since the symbol is not used on other platforms it makes sense to
build glapi_gentable.c only on Darwin.

As a side effect it accelerates the build a bit and reduces the size
of libGL.so.1.2.0 as follows:

size lib/libGL.so.1.2.0 on my system shows
   text	   data	    bss	    dec	    hex	filename
 469211	  21848	   2720	 493779	  788d3	lib/libGL.so.1.2.0 before
 420988	  11240	   2720	 434948	  6a304	lib/libGL.so.1.2.0 after

A little bit of history:

_glapi_create_table_from_handle was introduced in

commit 85937f4c0d4a78d3a11e3c1fa6148640f2a9ad7b
Author: Jeremy Huddleston <jeremyhu@apple.com>
Date:   Thu Jun 9 16:59:49 2011 -0700

    glapi: Add API that can create a _glapi_table from a dlfcn handle

    Example usage:

    void *handle = dlopen(opengl_library_path, RTLD_LOCAL);
    struct _glapi_table *disp = _glapi_create_table_from_handle(handle,
"gl");

    Signed-off-by: Jeremy Huddleston <jeremyhu@apple.com>

and the only user in mesa was added in

commit f35913b96e743c5014e99220b1a1c5532a894d69
Author: Jeremy Huddleston <jeremyhu@apple.com>
Date:   Thu Jun 9 17:29:51 2011 -0700

    apple: Use _glapi_create_table_from_handle to initialize our
dispatch table

    Signed-off-by: Jeremy Huddleston <jeremyhu@apple.com>

gl_gentable.py was also used for XQuartz in xserver 1.11 - 1.14.

v2: Fix typos in commit message
    Add missing XORG_GLAPI_OUTPUTS += \ into src/mapi/glapi/gen/Makefile.am
    Add glapi_gentable.c to EXTRA_DIST for inclusion in the release
    tarball

v3: Fix commit message: s/gl_gentable.c/glapi_gentable.c/

Reported-by: Arlie Davis <arlied@google.com>
Cc: Jeremy Huddleston <jeremyhu@apple.com>
Signed-off-by: Andreas Boll <andreas.boll.dev@gmail.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mapi/Makefile.am           |  6 +++++-
 src/mapi/glapi/gen/Makefile.am | 14 +++++++++++---
 src/mapi/glapi/glapi.h         |  2 ++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/mapi/Makefile.am b/src/mapi/Makefile.am
index 5ba924192cf..68a28a2283c 100644
--- a/src/mapi/Makefile.am
+++ b/src/mapi/Makefile.am
@@ -107,12 +107,16 @@ if HAVE_SPARC_ASM
 GLAPI_ASM_SOURCES = glapi/glapi_sparc.S
 endif
 
-glapi_libglapi_la_SOURCES = glapi/glapi_gentable.c
+glapi_libglapi_la_SOURCES =
 glapi_libglapi_la_CPPFLAGS = \
 	$(AM_CPPFLAGS) \
 	-I$(top_srcdir)/src/mapi/glapi \
 	-I$(top_srcdir)/src/mesa
 
+if HAVE_APPLEDRI
+glapi_libglapi_la_SOURCES += glapi/glapi_gentable.c
+endif
+
 if HAVE_SHARED_GLAPI
 glapi_libglapi_la_SOURCES += $(MAPI_BRIDGE_FILES) glapi/glapi_mapi_tmp.h
 glapi_libglapi_la_CPPFLAGS += \
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 900b61a5d45..3f3e0b9af5f 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -27,8 +27,11 @@ MESA_GLAPI_OUTPUTS = \
 	$(MESA_GLAPI_DIR)/glapi_mapi_tmp.h \
 	$(MESA_GLAPI_DIR)/glprocs.h \
 	$(MESA_GLAPI_DIR)/glapitemp.h \
-	$(MESA_GLAPI_DIR)/glapitable.h \
-	$(MESA_GLAPI_DIR)/glapi_gentable.c
+	$(MESA_GLAPI_DIR)/glapitable.h
+
+if HAVE_APPLEDRI
+MESA_GLAPI_OUTPUTS += $(MESA_GLAPI_DIR)/glapi_gentable.c
+endif
 
 MESA_GLAPI_ASM_OUTPUTS =
 if HAVE_X86_ASM
@@ -57,6 +60,7 @@ BUILT_SOURCES = \
 	$(MESA_GLX_DIR)/indirect_size.c
 EXTRA_DIST= \
 	$(BUILT_SOURCES) \
+	$(MESA_GLAPI_DIR)/glapi_gentable.c \
 	$(MESA_GLAPI_DIR)/glapi_x86.S \
 	$(MESA_GLAPI_DIR)/glapi_x86-64.S \
 	$(MESA_GLAPI_DIR)/glapi_sparc.S \
@@ -88,8 +92,12 @@ XORG_GLAPI_DIR = $(XORG_BASE)/glx
 XORG_GLAPI_OUTPUTS = \
 	$(XORG_GLAPI_DIR)/glprocs.h \
 	$(XORG_GLAPI_DIR)/glapitable.h \
-	$(XORG_GLAPI_DIR)/dispatch.h \
+	$(XORG_GLAPI_DIR)/dispatch.h
+
+if HAVE_APPLEDRI
+XORG_GLAPI_OUTPUTS += \
 	$(XORG_GLAPI_DIR)/glapi_gentable.c
+endif
 
 XORG_OUTPUTS = \
 	$(XORG_GLAPI_OUTPUTS) \
diff --git a/src/mapi/glapi/glapi.h b/src/mapi/glapi/glapi.h
index f269b1701bc..3593c88bbc1 100644
--- a/src/mapi/glapi/glapi.h
+++ b/src/mapi/glapi/glapi.h
@@ -158,8 +158,10 @@ _GLAPI_EXPORT const char *
 _glapi_get_proc_name(unsigned int offset);
 
 
+#ifdef GLX_USE_APPLEGL
 _GLAPI_EXPORT struct _glapi_table *
 _glapi_create_table_from_handle(void *handle, const char *symbol_prefix);
+#endif
 
 
 _GLAPI_EXPORT void

From a087a09fa86f9617af98f6294dd2228555a4891c Mon Sep 17 00:00:00 2001
From: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
Date: Wed, 20 Jan 2016 16:59:45 -0800
Subject: [PATCH 096/119] mesa: Fix some function prototype mismatching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

main/api_exec.c:543:36: warning: incompatible pointer types passing 'void (GLhandleARB, GLuint, const GLcharARB *)' (aka 'void (unsigned long, unsigned int, const char *)') to
parameter of
      type 'void (*)(GLuint, GLuint, const GLchar *)' (aka 'void (*)(unsigned int, unsigned int, const char *)') [-Wincompatible-pointer-types]
      SET_BindAttribLocation(exec, _mesa_BindAttribLocation);
                                   ^~~~~~~~~~~~~~~~~~~~~~~~
./main/dispatch.h:7590:88: note: passing argument to parameter 'fn' here
static inline void SET_BindAttribLocation(struct _glapi_table *disp, void (GLAPIENTRYP fn)(GLuint, GLuint, const GLchar *)) {
                                                                                       ^
main/api_exec.c:547:31: warning: incompatible pointer types passing 'void (GLhandleARB)' (aka 'void (unsigned long)') to parameter of type 'void (*)(GLuint)' (aka 'void (*)(unsigned
int)')
      [-Wincompatible-pointer-types]
      SET_CompileShader(exec, _mesa_CompileShader);
                              ^~~~~~~~~~~~~~~~~~~
./main/dispatch.h:7612:83: note: passing argument to parameter 'fn' here
static inline void SET_CompileShader(struct _glapi_table *disp, void (GLAPIENTRYP fn)(GLuint)) {
                                                                                  ^
main/api_exec.c:568:33: warning: incompatible pointer types passing 'void (GLhandleARB, GLuint, GLsizei, GLsizei *, GLint *, GLenum *, GLcharARB *)' (aka 'void (unsigned long,
unsigned int,
      int, int *, int *, unsigned int *, char *)') to parameter of type 'void (*)(GLuint, GLuint, GLsizei, GLsizei *, GLint *, GLenum *, GLchar *)' (aka 'void (*)(unsigned int,
unsigned int,
      int, int *, int *, unsigned int *, char *)') [-Wincompatible-pointer-types]
      SET_GetActiveAttrib(exec, _mesa_GetActiveAttrib);
                                ^~~~~~~~~~~~~~~~~~~~~
./main/dispatch.h:7711:85: note: passing argument to parameter 'fn' here
static inline void SET_GetActiveAttrib(struct _glapi_table *disp, void (GLAPIENTRYP fn)(GLuint, GLuint, GLsizei , GLsizei *, GLint *, GLenum *, GLchar *)) {
                                                                                    ^
main/api_exec.c:571:35: warning: incompatible pointer types passing 'GLint (GLhandleARB, const GLcharARB *)' (aka 'int (unsigned long, const char *)') to parameter of type
      'GLint (*)(GLuint, const GLchar *)' (aka 'int (*)(unsigned int, const char *)') [-Wincompatible-pointer-types]
      SET_GetAttribLocation(exec, _mesa_GetAttribLocation);
                                  ^~~~~~~~~~~~~~~~~~~~~~~
./main/dispatch.h:7744:88: note: passing argument to parameter 'fn' here
static inline void SET_GetAttribLocation(struct _glapi_table *disp, GLint (GLAPIENTRYP fn)(GLuint, const GLchar *)) {
                                                                                       ^
main/api_exec.c:585:33: warning: incompatible pointer types passing 'void (GLhandleARB, GLsizei, GLsizei *, GLcharARB *)' (aka 'void (unsigned long, int, int *, char *)') to
parameter of
      type 'void (*)(GLuint, GLsizei, GLsizei *, GLchar *)' (aka 'void (*)(unsigned int, int, int *, char *)') [-Wincompatible-pointer-types]
      SET_GetShaderSource(exec, _mesa_GetShaderSource);
                                ^~~~~~~~~~~~~~~~~~~~~
./main/dispatch.h:7788:85: note: passing argument to parameter 'fn' here
static inline void SET_GetShaderSource(struct _glapi_table *disp, void (GLAPIENTRYP fn)(GLuint, GLsizei, GLsizei *, GLchar *)) {
                                                                                    ^
main/api_exec.c:597:29: warning: incompatible pointer types passing 'void (GLhandleARB)' (aka 'void (unsigned long)') to parameter of type 'void (*)(GLuint)' (aka 'void (*)(unsigned
int)')
      [-Wincompatible-pointer-types]
      SET_LinkProgram(exec, _mesa_LinkProgram);
                            ^~~~~~~~~~~~~~~~~
./main/dispatch.h:7909:81: note: passing argument to parameter 'fn' here
static inline void SET_LinkProgram(struct _glapi_table *disp, void (GLAPIENTRYP fn)(GLuint)) {
                                                                                ^
main/api_exec.c:628:30: warning: incompatible pointer types passing 'void (GLhandleARB, GLsizei, const GLcharARB *const *, const GLint *)' (aka
      'void (unsigned long, int, const char *const *, const int *)') to parameter of type 'void (*)(GLuint, GLsizei, const GLchar *const *, const GLint *)' (aka 'void (*)(unsigned
int, int,
      const char *const *, const int *)') [-Wincompatible-pointer-types]
      SET_ShaderSource(exec, _mesa_ShaderSource);
                             ^~~~~~~~~~~~~~~~~~
./main/dispatch.h:7920:82: note: passing argument to parameter 'fn' here
static inline void SET_ShaderSource(struct _glapi_table *disp, void (GLAPIENTRYP fn)(GLuint, GLsizei, const GLchar * const *, const GLint *)) {
                                                                                 ^
main/api_exec.c:653:28: warning: incompatible pointer types passing 'void (GLhandleARB)' (aka 'void (unsigned long)') to parameter of type 'void (*)(GLuint)' (aka 'void (*)(unsigned
int)')
      [-Wincompatible-pointer-types]
      SET_UseProgram(exec, _mesa_UseProgram);
                           ^~~~~~~~~~~~~~~~
./main/dispatch.h:8173:80: note: passing argument to parameter 'fn' here
static inline void SET_UseProgram(struct _glapi_table *disp, void (GLAPIENTRYP fn)(GLuint)) {
                                                                               ^
main/api_exec.c:655:33: warning: incompatible pointer types passing 'void (GLhandleARB)' (aka 'void (unsigned long)') to parameter of type 'void (*)(GLuint)' (aka 'void (*)(unsigned
int)')
      [-Wincompatible-pointer-types]
      SET_ValidateProgram(exec, _mesa_ValidateProgram);
                                ^~~~~~~~~~~~~~~~~~~~~
./main/dispatch.h:8184:85: note: passing argument to parameter 'fn' here
static inline void SET_ValidateProgram(struct _glapi_table *disp, void (GLAPIENTRYP fn)(GLuint)) {

main/dlist.c:9457:26: warning: incompatible pointer types passing 'void (GLhandleARB)' (aka 'void (unsigned long)') to parameter of type 'void (*)(GLuint)' (aka 'void (*)(unsigned
int)')
      [-Wincompatible-pointer-types]
   SET_UseProgram(table, save_UseProgramObjectARB);
                         ^~~~~~~~~~~~~~~~~~~~~~~~
./main/dispatch.h:8173:80: note: passing argument to parameter 'fn' here
static inline void SET_UseProgram(struct _glapi_table *disp, void (GLAPIENTRYP fn)(GLuint)) {
                                                                               ^
1 warning generated.

Signed-off-by: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/mesa/main/dlist.c          |  5 ++---
 src/mesa/main/shader_query.cpp | 12 ++++++------
 src/mesa/main/shaderapi.c      | 16 ++++++++--------
 src/mesa/main/shaderapi.h      | 20 ++++++++++----------
 4 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index ba2e670eb9a..cd8e3b6a2f2 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -5982,9 +5982,8 @@ save_DrawTransformFeedbackStreamInstanced(GLenum mode, GLuint name,
    }
 }
 
-/* aka UseProgram() */
 static void GLAPIENTRY
-save_UseProgramObjectARB(GLhandleARB program)
+save_UseProgram(GLuint program)
 {
    GET_CURRENT_CONTEXT(ctx);
    Node *n;
@@ -9454,7 +9453,7 @@ _mesa_initialize_save_table(const struct gl_context *ctx)
 
    SET_BlitFramebuffer(table, save_BlitFramebufferEXT);
 
-   SET_UseProgram(table, save_UseProgramObjectARB);
+   SET_UseProgram(table, save_UseProgram);
    SET_Uniform1f(table, save_Uniform1fARB);
    SET_Uniform2f(table, save_Uniform2fARB);
    SET_Uniform3f(table, save_Uniform3fARB);
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index a18b860022d..e902585924a 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -64,8 +64,8 @@ DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info);
 DECL_RESOURCE_FUNC(SUB, gl_subroutine_function);
 
 void GLAPIENTRY
-_mesa_BindAttribLocation(GLhandleARB program, GLuint index,
-                            const GLcharARB *name)
+_mesa_BindAttribLocation(GLuint program, GLuint index,
+                         const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
 
@@ -126,9 +126,9 @@ is_active_attrib(const gl_shader_variable *var)
 }
 
 void GLAPIENTRY
-_mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index,
-                         GLsizei maxLength, GLsizei * length, GLint * size,
-                         GLenum * type, GLcharARB * name)
+_mesa_GetActiveAttrib(GLuint program, GLuint desired_index,
+                      GLsizei maxLength, GLsizei * length, GLint * size,
+                      GLenum * type, GLchar * name)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_shader_program *shProg;
@@ -191,7 +191,7 @@ _mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index,
 }
 
 GLint GLAPIENTRY
-_mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name)
+_mesa_GetAttribLocation(GLuint program, const GLchar * name)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_shader_program *const shProg =
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 126786cfaf4..9512e3b3df5 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1265,7 +1265,7 @@ _mesa_AttachShader(GLuint program, GLuint shader)
 
 
 void GLAPIENTRY
-_mesa_CompileShader(GLhandleARB shaderObj)
+_mesa_CompileShader(GLuint shaderObj)
 {
    GET_CURRENT_CONTEXT(ctx);
    if (MESA_VERBOSE & VERBOSE_API)
@@ -1479,8 +1479,8 @@ _mesa_GetShaderInfoLog(GLuint shader, GLsizei bufSize,
 
 
 void GLAPIENTRY
-_mesa_GetShaderSource(GLhandleARB shader, GLsizei maxLength,
-                         GLsizei *length, GLcharARB *sourceOut)
+_mesa_GetShaderSource(GLuint shader, GLsizei maxLength,
+                      GLsizei *length, GLchar *sourceOut)
 {
    GET_CURRENT_CONTEXT(ctx);
    get_shader_source(ctx, shader, maxLength, length, sourceOut);
@@ -1512,7 +1512,7 @@ _mesa_IsShader(GLuint name)
 
 
 void GLAPIENTRY
-_mesa_LinkProgram(GLhandleARB programObj)
+_mesa_LinkProgram(GLuint programObj)
 {
    GET_CURRENT_CONTEXT(ctx);
    if (MESA_VERBOSE & VERBOSE_API)
@@ -1641,8 +1641,8 @@ read_shader(const gl_shader_stage stage, const char *source)
  * and pass it to _mesa_shader_source().
  */
 void GLAPIENTRY
-_mesa_ShaderSource(GLhandleARB shaderObj, GLsizei count,
-                   const GLcharARB * const * string, const GLint * length)
+_mesa_ShaderSource(GLuint shaderObj, GLsizei count,
+                   const GLchar * const * string, const GLint * length)
 {
    GET_CURRENT_CONTEXT(ctx);
    GLint *offsets;
@@ -1729,7 +1729,7 @@ _mesa_ShaderSource(GLhandleARB shaderObj, GLsizei count,
 
 
 void GLAPIENTRY
-_mesa_UseProgram(GLhandleARB program)
+_mesa_UseProgram(GLuint program)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_shader_program *shProg;
@@ -1791,7 +1791,7 @@ _mesa_UseProgram(GLhandleARB program)
 
 
 void GLAPIENTRY
-_mesa_ValidateProgram(GLhandleARB program)
+_mesa_ValidateProgram(GLuint program)
 {
    GET_CURRENT_CONTEXT(ctx);
    validate_program(ctx, program);
diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h
index fba767bf4c1..8922c4d0640 100644
--- a/src/mesa/main/shaderapi.h
+++ b/src/mesa/main/shaderapi.h
@@ -64,7 +64,7 @@ extern void GLAPIENTRY
 _mesa_AttachObjectARB(GLhandleARB, GLhandleARB);
 
 extern void  GLAPIENTRY
-_mesa_CompileShader(GLhandleARB);
+_mesa_CompileShader(GLuint);
 
 extern GLhandleARB GLAPIENTRY
 _mesa_CreateProgramObjectARB(void);
@@ -100,7 +100,7 @@ extern void GLAPIENTRY
 _mesa_GetObjectParameterivARB(GLhandleARB, GLenum, GLint *);
 
 extern void GLAPIENTRY
-_mesa_GetShaderSource(GLhandleARB, GLsizei, GLsizei *, GLcharARB *);
+_mesa_GetShaderSource(GLuint, GLsizei, GLsizei *, GLchar *);
 
 extern GLboolean GLAPIENTRY
 _mesa_IsProgram(GLuint name);
@@ -109,20 +109,20 @@ extern GLboolean GLAPIENTRY
 _mesa_IsShader(GLuint name);
 
 extern void GLAPIENTRY
-_mesa_LinkProgram(GLhandleARB programObj);
+_mesa_LinkProgram(GLuint programObj);
 
 extern void GLAPIENTRY
-_mesa_ShaderSource(GLhandleARB, GLsizei, const GLcharARB* const *, const GLint *);
+_mesa_ShaderSource(GLuint, GLsizei, const GLchar* const *, const GLint *);
 
 extern void GLAPIENTRY
-_mesa_UseProgram(GLhandleARB);
+_mesa_UseProgram(GLuint);
 
 extern void GLAPIENTRY
-_mesa_ValidateProgram(GLhandleARB);
+_mesa_ValidateProgram(GLuint);
 
 
 extern void GLAPIENTRY
-_mesa_BindAttribLocation(GLhandleARB, GLuint, const GLcharARB *);
+_mesa_BindAttribLocation(GLuint program, GLuint, const GLchar *);
 
 extern void GLAPIENTRY
 _mesa_BindFragDataLocation(GLuint program, GLuint colorNumber,
@@ -133,11 +133,11 @@ _mesa_BindFragDataLocationIndexed(GLuint program, GLuint colorNumber,
                                   GLuint index, const GLchar *name);
 
 extern void GLAPIENTRY
-_mesa_GetActiveAttrib(GLhandleARB, GLuint, GLsizei, GLsizei *, GLint *,
-                         GLenum *, GLcharARB *);
+_mesa_GetActiveAttrib(GLuint, GLuint, GLsizei, GLsizei *, GLint *,
+                         GLenum *, GLchar *);
 
 extern GLint GLAPIENTRY
-_mesa_GetAttribLocation(GLhandleARB, const GLcharARB *);
+_mesa_GetAttribLocation(GLuint, const GLchar *);
 
 
 

From b20d6bf96d8a751cdc2ec36886c9c47d6e93a8ef Mon Sep 17 00:00:00 2001
From: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
Date: Wed, 20 Jan 2016 17:03:26 -0800
Subject: [PATCH 097/119] mesa: Fix format warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

main/shaderapi.c:1318:51: warning: format specifies type 'unsigned int' but the argument has type 'GLhandleARB' (aka 'unsigned long') [-Wformat]
      _mesa_debug(ctx, "glDeleteObjectARB(%u)\n", obj);
                                          ~~      ^~~
                                          %lu

Signed-off-by: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/mesa/main/shaderapi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 9512e3b3df5..a988f41697b 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1315,7 +1315,7 @@ _mesa_DeleteObjectARB(GLhandleARB obj)
 {
    if (MESA_VERBOSE & VERBOSE_API) {
       GET_CURRENT_CONTEXT(ctx);
-      _mesa_debug(ctx, "glDeleteObjectARB(%u)\n", obj);
+      _mesa_debug(ctx, "glDeleteObjectARB(%lu)\n", (unsigned long)obj);
    }
 
    if (obj) {

From 739ac3d39dacdede853d150b9903001524453330 Mon Sep 17 00:00:00 2001
From: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
Date: Wed, 20 Jan 2016 17:10:54 -0800
Subject: [PATCH 098/119] mesa: Deal with size differences between GLuint and
 GLhandleARB in GetAttachedObjectsARB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jeremy Huddleston Sequoia <jeremyhu@apple.com>
Reviewed-by: Nicolai Hähnle <nhaehnle@gmail.com>
---
 src/mesa/main/shaderapi.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index a988f41697b..5854369a28c 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1374,10 +1374,26 @@ _mesa_DetachShader(GLuint program, GLuint shader)
 
 void GLAPIENTRY
 _mesa_GetAttachedObjectsARB(GLhandleARB container, GLsizei maxCount,
-                            GLsizei * count, GLhandleARB * obj)
+                            GLsizei * count, GLhandleARB * objARB)
 {
+   int i;
+   GLuint *obj;
+
    GET_CURRENT_CONTEXT(ctx);
+
+   obj = calloc(maxCount, sizeof(GLuint));
+   if (!obj) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetAttachedObjectsARB");
+      return;
+   }
+
    get_attached_shaders(ctx, container, maxCount, count, obj);
+
+   for (i = 0 ; i < *count; i++) {
+      objARB[i] = (GLhandleARB)obj[i];
+   }
+
+   free(obj);
 }
 
 

From 0153ff8379be789262ad9cd636080d92b77becad Mon Sep 17 00:00:00 2001
From: Grazvydas Ignotas <notasas@gmail.com>
Date: Thu, 21 Jan 2016 01:52:24 +0200
Subject: [PATCH 099/119] r600g: don't leak driver const buffers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The buffers are referenced from r600_update_driver_const_buffers()
 -> r600_set_constant_buffer() -> u_upload_data(), but nothing
ever releases the reference. Similar case with driver_consts.
Found using valgrind.

Signed-off-by: Grazvydas Ignotas <notasas@gmail.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/r600/r600_pipe.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 08fdd361049..8abd60231f9 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -68,6 +68,7 @@ static const struct debug_named_value r600_debug_options[] = {
 static void r600_destroy_context(struct pipe_context *context)
 {
 	struct r600_context *rctx = (struct r600_context *)context;
+	unsigned sh;
 
 	r600_isa_destroy(rctx->isa);
 
@@ -76,6 +77,11 @@ static void r600_destroy_context(struct pipe_context *context)
 	pipe_resource_reference((struct pipe_resource**)&rctx->dummy_cmask, NULL);
 	pipe_resource_reference((struct pipe_resource**)&rctx->dummy_fmask, NULL);
 
+	for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) {
+		rctx->b.b.set_constant_buffer(&rctx->b.b, sh, R600_BUFFER_INFO_CONST_BUFFER, NULL);
+		free(rctx->driver_consts[sh].constants);
+	}
+
 	if (rctx->fixed_func_tcs_shader)
 		rctx->b.b.delete_tcs_state(&rctx->b.b, rctx->fixed_func_tcs_shader);
 

From 666d96d1697fc9a1452519285758cc16240290d2 Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Thu, 21 Jan 2016 17:12:29 +0000
Subject: [PATCH 100/119] texobj: Fix the completeness checks for cube textures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to the GL 1.4 spec section 3.8.10, a cubemap texture is only
complete if:

• The level base arrays of each of the six texture images making up
  the cube map have identical, positive, and square dimensions.
• The level base arrays were each specified with the same internal
  format.
• The level base arrays each have the same border width.

Previously the texture completeness code was only checking the first
point. This patch makes it additionally check the other two.

This fixes the following two dEQP tests:

deqp-gles2.functional.texture.completeness.cube.format_mismatch_rgba_rgb_level_0_neg_z
deqp-gles2.functional.texture.completeness.cube.format_mismatch_rgb_rgba_level_0_pos_z

And also this Piglit test:

spec/!opengl 2.0/incomplete-cubemap-format

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93792
Cc: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/texobj.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index b107a8f8678..9ce7b4c4754 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -769,7 +769,8 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
    }
 
    if (t->Target == GL_TEXTURE_CUBE_MAP_ARB) {
-      /* Make sure that all six cube map level 0 images are the same size.
+      /* Make sure that all six cube map level 0 images are the same size and
+       * format.
        * Note:  we know that the image's width==height (we enforce that
        * at glTexImage time) so we only need to test the width here.
        */
@@ -784,6 +785,15 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
             incomplete(t, BASE, "Cube face missing or mismatched size");
             return;
          }
+         if (t->Image[face][baseLevel]->InternalFormat !=
+             baseImage->InternalFormat) {
+            incomplete(t, BASE, "Cube face format mismatch");
+            return;
+         }
+         if (t->Image[face][baseLevel]->Border != baseImage->Border) {
+            incomplete(t, BASE, "Cube face border size mismatch");
+            return;
+         }
       }
    }
 

From cbf0e64ee1e2d4514dbec9e4fbf688098fcef318 Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Thu, 21 Jan 2016 17:28:07 +0000
Subject: [PATCH 101/119] texobj: Remove redundant checks that the texture cube
 faces match size

The texture mipmap completeness checking code was checking whether all
of the faces have the same size. However this is pointless because the
code just above it checks whether the face has the expected size
calculated for the mipmap level anyway so the error condition could
never be reached. This patch just removes it.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/texobj.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index 9ce7b4c4754..e926c7b6cd2 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -868,16 +868,6 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
                              img->Depth2);
                   return;
                }
-
-               /* Extra checks for cube textures */
-               if (face > 0) {
-                  /* check that cube faces are the same size */
-                  if (img->Width2 != t->Image[0][i]->Width2 ||
-                      img->Height2 != t->Image[0][i]->Height2) {
-		     incomplete(t, MIPMAP, "CubeMap Image[n][i] bad size");
-		     return;
-		  }
-               }
             }
          }
 

From 1f7a96e005be0c0941df5487a11c53f048ebd58a Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 10 Aug 2015 10:37:53 -0400
Subject: [PATCH 102/119] mesa: add GREMEDY_string_marker

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mapi/glapi/gen/GREMEDY_string_marker.xml | 18 ++++++++++++++++++
 src/mapi/glapi/gen/Makefile.am               |  1 +
 src/mapi/glapi/gen/gl_API.xml                |  2 ++
 src/mesa/main/dd.h                           |  6 ++++++
 src/mesa/main/errors.c                       | 13 +++++++++++++
 src/mesa/main/errors.h                       |  3 +++
 src/mesa/main/extensions_table.h             |  2 ++
 src/mesa/main/mtypes.h                       |  1 +
 src/mesa/main/tests/dispatch_sanity.cpp      |  3 +++
 9 files changed, 49 insertions(+)
 create mode 100644 src/mapi/glapi/gen/GREMEDY_string_marker.xml

diff --git a/src/mapi/glapi/gen/GREMEDY_string_marker.xml b/src/mapi/glapi/gen/GREMEDY_string_marker.xml
new file mode 100644
index 00000000000..ffa3eac5898
--- /dev/null
+++ b/src/mapi/glapi/gen/GREMEDY_string_marker.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<!-- Note: no GLX protocol info yet. -->
+
+
+<OpenGLAPI>
+
+<category name="GL_GREMEDY_string_marker" number="311">
+
+    <function name="StringMarkerGREMEDY">
+        <param name="len" type="GLsizei"/>
+        <param name="string" type="const GLvoid *"/>
+    </function>
+
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 3f3e0b9af5f..cd7feabba24 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -196,6 +196,7 @@ API_XML = \
 	EXT_texture_array.xml \
 	EXT_texture_integer.xml \
 	EXT_transform_feedback.xml \
+	GREMEDY_string_marker.xml \
 	INTEL_performance_query.xml \
 	KHR_debug.xml \
 	KHR_context_flush_control.xml \
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 593ace49563..d7ab3bff4df 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -12620,6 +12620,8 @@
 
 <xi:include href="EXT_framebuffer_object.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
+<xi:include href="GREMEDY_string_marker.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
 <xi:include href="EXT_packed_depth_stencil.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
 <xi:include href="EXT_provoking_vertex.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 70ed5633f7b..d4378e51159 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -762,6 +762,12 @@ struct dd_function_table {
    void (*UseProgram)(struct gl_context *ctx, struct gl_shader_program *shProg);
    /*@}*/
 
+   /**
+    * \name GREMEDY debug/marker functions
+    */
+   /*@{*/
+   void (*EmitStringMarker)(struct gl_context *ctx, const GLchar *string, GLsizei len);
+   /*@}*/
 
    /**
     * \name Support for multiple T&L engines
diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index 9e6610918c4..630d3525fd4 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -1276,6 +1276,19 @@ _mesa_free_errors_data(struct gl_context *ctx)
    mtx_destroy(&ctx->DebugMutex);
 }
 
+void GLAPIENTRY
+_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   if (ctx->Extensions.GREMEDY_string_marker) {
+      /* if length not specified, string will be null terminated: */
+      if (len <= 0)
+         len = strlen(string);
+      ctx->Driver.EmitStringMarker(ctx, string, len);
+   } else {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "StringMarkerGREMEDY");
+   }
+}
 
 /**********************************************************************/
 /** \name Diagnostics */
diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h
index f2919765488..92df2ac868a 100644
--- a/src/mesa/main/errors.h
+++ b/src/mesa/main/errors.h
@@ -138,6 +138,9 @@ _mesa_PushDebugGroup(GLenum source, GLuint id, GLsizei length,
 void GLAPIENTRY
 _mesa_PopDebugGroup(void);
 
+void GLAPIENTRY
+_mesa_StringMarkerGREMEDY(GLsizei len, const GLvoid *string);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index aeccb017423..9cec1762dbe 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -251,6 +251,8 @@ EXT(EXT_unpack_subimage                     , dummy_true
 EXT(EXT_vertex_array                        , dummy_true                             , GLL,  x ,  x ,  x , 1995)
 EXT(EXT_vertex_array_bgra                   , EXT_vertex_array_bgra                  , GLL, GLC,  x ,  x , 2008)
 
+EXT(GREMEDY_string_marker                   , GREMEDY_string_marker                  , GLL, GLC,  x ,  x , 2007)
+
 EXT(IBM_multimode_draw_arrays               , dummy_true                             , GLL, GLC,  x ,  x , 1998)
 EXT(IBM_rasterpos_clip                      , dummy_true                             , GLL,  x ,  x ,  x , 1996)
 EXT(IBM_texture_mirrored_repeat             , dummy_true                             , GLL,  x ,  x ,  x , 1998)
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 0992d4d30f0..3912c2bc847 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3880,6 +3880,7 @@ struct gl_extensions
    GLboolean ATI_texture_env_combine3;
    GLboolean ATI_fragment_shader;
    GLboolean ATI_separate_stencil;
+   GLboolean GREMEDY_string_marker;
    GLboolean INTEL_performance_query;
    GLboolean KHR_texture_compression_astc_hdr;
    GLboolean KHR_texture_compression_astc_ldr;
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 7610bcbd701..eb1108124e9 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -940,6 +940,9 @@ const struct function common_desktop_functions_possible[] = {
    { "glGetTextureSubImage", 20, -1 },
    { "glGetCompressedTextureSubImage", 20, -1 },
 
+   /* GL_GREMEDY_string_marker */
+   { "glStringMarkerGREMEDY", 15, -1 },
+
    { NULL, 0, -1 }
 };
 

From a6a99fbf05865efcb628ecb2a19d1d77db34c865 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 5 Dec 2015 11:32:25 -0500
Subject: [PATCH 103/119] mesa: wire up EmitStringMarker for KHR_debug

The extension spec[1] describes DEBUG_TYPE_MARKER as "Annotation of the
command stream".  So for DEBUG_TYPE_MARKER, also pass the buf to the
driver's EmitStringMarker() to be inserted in the command stream.

[1] https://www.opengl.org/registry/specs/KHR/debug.txt

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/errors.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index 630d3525fd4..674364c7b0c 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -1018,6 +1018,13 @@ _mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id,
            gl_enum_to_debug_type(type), id,
            gl_enum_to_debug_severity(severity),
            length, buf);
+
+   if (type == GL_DEBUG_TYPE_MARKER && ctx->Driver.EmitStringMarker) {
+      /* if length not specified, string will be null terminated: */
+      if (length < 0)
+         length = strlen(buf);
+      ctx->Driver.EmitStringMarker(ctx, buf, length);
+   }
 }
 
 

From d6408372eb359d972614838f838776f1695e3c99 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 10 Aug 2015 11:41:29 -0400
Subject: [PATCH 104/119] gallium: add GREMEDY_string_marker

Since the GREMEDY extensions are normally only exposed by the gremedy
debugger (and could possibly trigger debug paths in the app), we don't
expose the extension by default, but instead only with
ST_DEBUG=gremedy.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/docs/source/screen.rst               | 1 +
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 src/gallium/drivers/i915/i915_screen.c           | 1 +
 src/gallium/drivers/ilo/ilo_screen.c             | 1 +
 src/gallium/drivers/llvmpipe/lp_screen.c         | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c           | 1 +
 src/gallium/drivers/r600/r600_pipe.c             | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c           | 1 +
 src/gallium/drivers/softpipe/sp_screen.c         | 1 +
 src/gallium/drivers/svga/svga_screen.c           | 1 +
 src/gallium/drivers/vc4/vc4_screen.c             | 1 +
 src/gallium/include/pipe/p_context.h             | 7 +++++++
 src/gallium/include/pipe/p_defines.h             | 1 +
 src/mesa/state_tracker/st_context.c              | 9 +++++++++
 src/mesa/state_tracker/st_debug.c                | 1 +
 src/mesa/state_tracker/st_debug.h                | 1 +
 src/mesa/state_tracker/st_extensions.c           | 5 +++++
 20 files changed, 38 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index d7ea123b0e9..b461810644a 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -305,6 +305,7 @@ The integer capabilities:
   for buffers is supported.
 * ``PIPE_CAP_GENERATE_MIPMAP``: Indicates whether pipe_context::generate_mipmap
   is supported.
+* ``PIPE_CAP_STRING_MARKER``: Whether pipe->emit_string_marker() is supported.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 6562924dae1..e7b21de6136 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -164,6 +164,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_BARRIER:
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
 	case PIPE_CAP_COMPUTE:
+	case PIPE_CAP_STRING_MARKER:
 		return 0;
 
 	case PIPE_CAP_SM3:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index ede5558a11e..6b0ab587001 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -261,6 +261,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
+   case PIPE_CAP_STRING_MARKER:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index fa327571b9b..5171cca9ea6 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -485,6 +485,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
+   case PIPE_CAP_STRING_MARKER:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index fb52f5dc063..879a2e7d2f0 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -310,6 +310,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
+   case PIPE_CAP_STRING_MARKER:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 933330f107a..61d91fd4cce 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -183,6 +183,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
+   case PIPE_CAP_STRING_MARKER:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 712835c1ce1..32da60e0a23 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -226,6 +226,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
+   case PIPE_CAP_STRING_MARKER:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 7211df94bb8..84dbd69b8a5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -215,6 +215,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
+   case PIPE_CAP_STRING_MARKER:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 8823b8d3197..90c4f71a945 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -209,6 +209,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
         case PIPE_CAP_INVALIDATE_BUFFER:
         case PIPE_CAP_GENERATE_MIPMAP:
+        case PIPE_CAP_STRING_MARKER:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 8abd60231f9..9b0f31270df 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -363,6 +363,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
 	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 	case PIPE_CAP_GENERATE_MIPMAP:
+	case PIPE_CAP_STRING_MARKER:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index f6ff4a81bd4..3e20c3b81fa 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -349,6 +349,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
 	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 	case PIPE_CAP_GENERATE_MIPMAP:
+	case PIPE_CAP_STRING_MARKER:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 143702a5650..3bc580899d4 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -260,6 +260,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
+   case PIPE_CAP_STRING_MARKER:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index b21634f3d73..8d04222a0cd 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -357,6 +357,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_STRING_MARKER:
       return 0;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 64;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index fb41877017d..08c2dad8406 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -198,6 +198,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
         case PIPE_CAP_INVALIDATE_BUFFER:
         case PIPE_CAP_GENERATE_MIPMAP:
+        case PIPE_CAP_STRING_MARKER:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 4b551ed0b41..f69a75be50e 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -678,6 +678,13 @@ struct pipe_context {
    void (*dump_debug_state)(struct pipe_context *ctx, FILE *stream,
                             unsigned flags);
 
+   /**
+    * Emit string marker in cmdstream
+    */
+   void (*emit_string_marker)(struct pipe_context *ctx,
+                              const char *string,
+                              int len);
+
    /**
     * Generate mipmap.
     * \return TRUE if mipmap generation succeeds, FALSE otherwise
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index cb837cd2597..b46187bc8a1 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -644,6 +644,7 @@ enum pipe_cap
    PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT,
    PIPE_CAP_INVALIDATE_BUFFER,
    PIPE_CAP_GENERATE_MIPMAP,
+   PIPE_CAP_STRING_MARKER,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 4add50e3ed9..ce1e97aacb5 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -438,6 +438,12 @@ void st_destroy_context( struct st_context *st )
    free(ctx);
 }
 
+static void
+st_emit_string_marker(struct gl_context *ctx, const GLchar *string, GLsizei len)
+{
+   struct st_context *st = ctx->st;
+   st->pipe->emit_string_marker(st->pipe, string, len);
+}
 
 void st_init_driver_functions(struct pipe_screen *screen,
                               struct dd_function_table *functions)
@@ -476,6 +482,9 @@ void st_init_driver_functions(struct pipe_screen *screen,
 
    st_init_vdpau_functions(functions);
 
+   if (screen->get_param(screen, PIPE_CAP_STRING_MARKER))
+      functions->EmitStringMarker = st_emit_string_marker;
+
    functions->Enable = st_Enable;
    functions->UpdateState = st_invalidate_state;
 }
diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c
index 134366db09d..9eb3b53b230 100644
--- a/src/mesa/state_tracker/st_debug.c
+++ b/src/mesa/state_tracker/st_debug.c
@@ -57,6 +57,7 @@ static const struct debug_named_value st_debug_flags[] = {
    { "buffer",   DEBUG_BUFFER, NULL },
    { "wf",       DEBUG_WIREFRAME, NULL },
    { "precompile",  DEBUG_PRECOMPILE, NULL },
+   { "gremedy",  DEBUG_GREMEDY, "Enable GREMEDY debug extensions" },
    DEBUG_NAMED_VALUE_END
 };
 
diff --git a/src/mesa/state_tracker/st_debug.h b/src/mesa/state_tracker/st_debug.h
index ed3ead82914..a094fdc2bfa 100644
--- a/src/mesa/state_tracker/st_debug.h
+++ b/src/mesa/state_tracker/st_debug.h
@@ -50,6 +50,7 @@ st_print_current(void);
 #define DEBUG_BUFFER    0x200
 #define DEBUG_WIREFRAME 0x400
 #define DEBUG_PRECOMPILE   0x800
+#define DEBUG_GREMEDY   0x1000
 
 #ifdef DEBUG
 extern int ST_DEBUG;
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 2a3e52362e4..53ea6767395 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -37,6 +37,7 @@
 #include "util/u_math.h"
 
 #include "st_context.h"
+#include "st_debug.h"
 #include "st_extensions.h"
 #include "st_format.h"
 
@@ -973,4 +974,8 @@ void st_init_extensions(struct pipe_screen *screen,
       extensions->ARB_gpu_shader_fp64 = GL_TRUE;
       extensions->ARB_vertex_attrib_64bit = GL_TRUE;
    }
+
+   if ((ST_DEBUG & DEBUG_GREMEDY) &&
+       screen->get_param(screen, PIPE_CAP_STRING_MARKER))
+      extensions->GREMEDY_string_marker = GL_TRUE;
 }

From bc1a37378c194400c502939fc00e2e658f3db3b5 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 10 Aug 2015 12:11:13 -0400
Subject: [PATCH 105/119] freedreno: implement emit_string_marker

Writes string to cmdstream in payload of a no-op packet.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/freedreno_context.c     | 27 +++++++++++++++++++
 .../drivers/freedreno/freedreno_screen.c      |  2 +-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 0b6b9fbbe7a..c5ea86f9368 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -141,6 +141,32 @@ fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
 	}
 }
 
+/**
+ * emit marker string as payload of a no-op packet, which can be
+ * decoded by cffdump.
+ */
+static void
+fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	struct fd_ringbuffer *ring = ctx->ring;
+	const uint32_t *buf = (const void *)string;
+
+	OUT_PKT3(ring, CP_NOP, align(len, 4) / 4);
+	while (len >= 4) {
+		OUT_RING(ring, *buf);
+		buf++;
+		len -= 4;
+	}
+
+	/* copy remainder bytes without reading past end of input string: */
+	if (len > 0) {
+		uint32_t w = 0;
+		memcpy(&w, buf, len);
+		OUT_RING(ring, w);
+	}
+}
+
 void
 fd_context_destroy(struct pipe_context *pctx)
 {
@@ -207,6 +233,7 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen,
 	pctx->screen = pscreen;
 	pctx->priv = priv;
 	pctx->flush = fd_context_flush;
+	pctx->emit_string_marker = fd_emit_string_marker;
 
 	for (i = 0; i < ARRAY_SIZE(ctx->rings); i++) {
 		ctx->rings[i] = fd_ringbuffer_new(screen->pipe, 0x100000);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index e7b21de6136..4b5d9c8a19e 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -155,6 +155,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_USER_CONSTANT_BUFFERS:
 	case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
 	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_STRING_MARKER:
 		return 1;
 
 	case PIPE_CAP_SHADER_STENCIL_EXPORT:
@@ -164,7 +165,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_BARRIER:
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
 	case PIPE_CAP_COMPUTE:
-	case PIPE_CAP_STRING_MARKER:
 		return 0;
 
 	case PIPE_CAP_SM3:

From 66672e791c1d9d8d168c661695b4959c122e3da5 Mon Sep 17 00:00:00 2001
From: Christian Gmeiner <christian.gmeiner@gmail.com>
Date: Wed, 20 Jan 2016 22:11:52 +0100
Subject: [PATCH 106/119] freedreno: make opc array static const

Signed-off-by: Christian Gmeiner <christian.gmeiner@gmail.com>
Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/disasm-a3xx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index 1d5022b69c7..599872470fc 100644
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -650,7 +650,7 @@ static void print_instr_cat6(instr_t *instr)
 /* size of largest OPC field of all the instruction categories: */
 #define NOPC_BITS 6
 
-struct opc_info {
+static const struct opc_info {
 	uint16_t cat;
 	uint16_t opc;
 	const char *name;

From 13b87e02b979b86872e1872b8cb325d376d05cc5 Mon Sep 17 00:00:00 2001
From: cstout <cstout@google.com>
Date: Fri, 11 Dec 2015 16:58:45 -0800
Subject: [PATCH 107/119] freedreno/a4xx: Add support for adreno 430

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 4b5d9c8a19e..640f50f5dcb 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -574,6 +574,7 @@ fd_screen_create(struct fd_device *dev)
 		fd3_screen_init(pscreen);
 		break;
 	case 420:
+	case 430:
 		fd4_screen_init(pscreen);
 		break;
 	default:

From b126039784997ad49c329683443370f85c1bfebd Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 21 Jan 2016 09:54:19 -0800
Subject: [PATCH 108/119] nir: Make argument order of unop_convert match
 binop_convert.

Strangely the return and parameter types were reversed.
---
 src/glsl/nir/nir_opcodes.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index e79810c1991..a8bbe1a0b82 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -105,7 +105,7 @@ def opcode(name, output_size, output_type, input_sizes, input_types,
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                           input_types, algebraic_properties, const_expr)
 
-def unop_convert(name, in_type, out_type, const_expr):
+def unop_convert(name, out_type, in_type, const_expr):
    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 
 def unop(name, ty, const_expr):
@@ -155,17 +155,17 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)")
 unop("fsqrt", tfloat, "sqrtf(src0)")
 unop("fexp2", tfloat, "exp2f(src0)")
 unop("flog2", tfloat, "log2f(src0)")
-unop_convert("f2i", tfloat, tint, "src0") # Float-to-integer conversion.
-unop_convert("f2u", tfloat, tuint, "src0") # Float-to-unsigned conversion
-unop_convert("i2f", tint, tfloat, "src0") # Integer-to-float conversion.
+unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion.
+unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion
+unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion.
 # Float-to-boolean conversion
-unop_convert("f2b", tfloat, tbool, "src0 != 0.0f")
+unop_convert("f2b", tbool, tfloat, "src0 != 0.0f")
 # Boolean-to-float conversion
-unop_convert("b2f", tbool, tfloat, "src0 ? 1.0f : 0.0f")
+unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f")
 # Int-to-boolean conversion
-unop_convert("i2b", tint, tbool, "src0 != 0")
-unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion
-unop_convert("u2f", tuint, tfloat, "src0") # Unsigned-to-float conversion.
+unop_convert("i2b", tbool, tint, "src0 != 0")
+unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
+unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion.
 
 # Unary floating-point rounding operations.
 
@@ -264,7 +264,7 @@ for (unsigned bit = 0; bit < 32; bit++) {
 }
 """)
 
-unop_convert("ufind_msb", tuint, tint, """
+unop_convert("ufind_msb", tint, tuint, """
 dst = -1;
 for (int bit = 31; bit > 0; bit--) {
    if ((src0 >> bit) & 1) {

From b6bb3b9bcd8fb03c8d307cafa34bc9ed8a4e2f28 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 21 Jan 2016 09:19:53 -0800
Subject: [PATCH 109/119] i965: Move brw_compiler_create() to new
 brw_compiler.c.

A future patch will want to use designated initalizers, which aren't
available in C++, but this is C.
---
 src/mesa/drivers/dri/i965/Makefile.sources |   1 +
 src/mesa/drivers/dri/i965/brw_compiler.c   | 157 +++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_compiler.h   |   3 +
 src/mesa/drivers/dri/i965/brw_shader.cpp   | 130 -----------------
 src/mesa/drivers/dri/i965/brw_shader.h     |   3 -
 5 files changed, 161 insertions(+), 133 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/brw_compiler.c

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index cea1e87ccde..caabb0decfb 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -1,6 +1,7 @@
 i965_compiler_FILES = \
 	brw_cfg.cpp \
 	brw_cfg.h \
+	brw_compiler.c \
 	brw_compiler.h \
 	brw_dead_control_flow.cpp \
 	brw_dead_control_flow.h \
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
new file mode 100644
index 00000000000..6ca59c7392c
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright © 2015-2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_compiler.h"
+#include "brw_context.h"
+#include "glsl/nir/nir.h"
+#include "main/errors.h"
+#include "util/debug.h"
+
+static void
+shader_debug_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+   va_list args;
+
+   va_start(args, fmt);
+   GLuint msg_id = 0;
+   _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                   MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                   MESA_DEBUG_TYPE_OTHER,
+                   MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args);
+   va_end(args);
+}
+
+static void
+shader_perf_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+
+   va_list args;
+   va_start(args, fmt);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+      va_list args_copy;
+      va_copy(args_copy, args);
+      vfprintf(stderr, fmt, args_copy);
+      va_end(args_copy);
+   }
+
+   if (brw->perf_debug) {
+      GLuint msg_id = 0;
+      _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                      MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                      MESA_DEBUG_TYPE_PERFORMANCE,
+                      MESA_DEBUG_SEVERITY_MEDIUM, fmt, args);
+   }
+   va_end(args);
+}
+
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
+{
+   struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
+
+   compiler->devinfo = devinfo;
+   compiler->shader_debug_log = shader_debug_log_mesa;
+   compiler->shader_perf_log = shader_perf_log_mesa;
+
+   brw_fs_alloc_reg_sets(compiler);
+   brw_vec4_alloc_reg_set(compiler);
+
+   compiler->scalar_stage[MESA_SHADER_VERTEX] =
+      devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
+   compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
+      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
+   compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
+      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false);
+   compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
+   compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;
+
+   nir_shader_compiler_options *nir_options =
+      rzalloc(compiler, nir_shader_compiler_options);
+   nir_options->native_integers = true;
+   nir_options->lower_fdiv = true;
+   /* In order to help allow for better CSE at the NIR level we tell NIR
+    * to split all ffma instructions during opt_algebraic and we then
+    * re-combine them as a later step.
+    */
+   nir_options->lower_ffma = true;
+   nir_options->lower_sub = true;
+   nir_options->lower_fdiv = true;
+   nir_options->lower_scmp = true;
+   nir_options->lower_fmod = true;
+   nir_options->lower_bitfield_extract = true;
+   nir_options->lower_bitfield_insert = true;
+   nir_options->lower_uadd_carry = true;
+   nir_options->lower_usub_borrow = true;
+
+   /* In the vec4 backend, our dpN instruction replicates its result to all
+    * the components of a vec4.  We would like NIR to give us replicated fdot
+    * instructions because it can optimize better for us.
+    *
+    * For the FS backend, it should be lowered away by the scalarizing pass so
+    * we should never see fdot anyway.
+    */
+   nir_options->fdot_replicates = true;
+
+   /* We want the GLSL compiler to emit code that uses condition codes */
+   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+      compiler->glsl_compiler_options[i].MaxUnrollIterations = 32;
+      compiler->glsl_compiler_options[i].MaxIfDepth =
+         devinfo->gen < 6 ? 16 : UINT_MAX;
+
+      compiler->glsl_compiler_options[i].EmitCondCodes = true;
+      compiler->glsl_compiler_options[i].EmitNoNoise = true;
+      compiler->glsl_compiler_options[i].EmitNoMainReturn = true;
+      compiler->glsl_compiler_options[i].EmitNoIndirectInput = true;
+      compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
+      compiler->glsl_compiler_options[i].LowerClipDistance = true;
+
+      bool is_scalar = compiler->scalar_stage[i];
+
+      compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar;
+      compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar;
+      compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar;
+
+      /* !ARB_gpu_shader5 */
+      if (devinfo->gen < 7)
+         compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
+
+      compiler->glsl_compiler_options[i].NirOptions = nir_options;
+
+      compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
+   }
+
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
+
+   if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
+      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
+
+   compiler->glsl_compiler_options[MESA_SHADER_COMPUTE]
+      .LowerShaderSharedVariables = true;
+
+   return compiler;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 224ddb14ed1..ec7d8be434b 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -686,6 +686,9 @@ struct brw_gs_prog_data
 
 /** @} */
 
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);
+
 /**
  * Compile a vertex shader.
  *
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index d92bad25a72..e42601b6f3e 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -29,136 +29,6 @@
 #include "brw_vec4_tes.h"
 #include "main/shaderobj.h"
 #include "main/uniforms.h"
-#include "util/debug.h"
-
-static void
-shader_debug_log_mesa(void *data, const char *fmt, ...)
-{
-   struct brw_context *brw = (struct brw_context *)data;
-   va_list args;
-
-   va_start(args, fmt);
-   GLuint msg_id = 0;
-   _mesa_gl_vdebug(&brw->ctx, &msg_id,
-                   MESA_DEBUG_SOURCE_SHADER_COMPILER,
-                   MESA_DEBUG_TYPE_OTHER,
-                   MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args);
-   va_end(args);
-}
-
-static void
-shader_perf_log_mesa(void *data, const char *fmt, ...)
-{
-   struct brw_context *brw = (struct brw_context *)data;
-
-   va_list args;
-   va_start(args, fmt);
-
-   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
-      va_list args_copy;
-      va_copy(args_copy, args);
-      vfprintf(stderr, fmt, args_copy);
-      va_end(args_copy);
-   }
-
-   if (brw->perf_debug) {
-      GLuint msg_id = 0;
-      _mesa_gl_vdebug(&brw->ctx, &msg_id,
-                      MESA_DEBUG_SOURCE_SHADER_COMPILER,
-                      MESA_DEBUG_TYPE_PERFORMANCE,
-                      MESA_DEBUG_SEVERITY_MEDIUM, fmt, args);
-   }
-   va_end(args);
-}
-
-struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
-{
-   struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
-
-   compiler->devinfo = devinfo;
-   compiler->shader_debug_log = shader_debug_log_mesa;
-   compiler->shader_perf_log = shader_perf_log_mesa;
-
-   brw_fs_alloc_reg_sets(compiler);
-   brw_vec4_alloc_reg_set(compiler);
-
-   compiler->scalar_stage[MESA_SHADER_VERTEX] =
-      devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
-   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
-   compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
-   compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false);
-   compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
-   compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;
-
-   nir_shader_compiler_options *nir_options =
-      rzalloc(compiler, nir_shader_compiler_options);
-   nir_options->native_integers = true;
-   nir_options->lower_fdiv = true;
-   /* In order to help allow for better CSE at the NIR level we tell NIR
-    * to split all ffma instructions during opt_algebraic and we then
-    * re-combine them as a later step.
-    */
-   nir_options->lower_ffma = true;
-   nir_options->lower_sub = true;
-   nir_options->lower_fdiv = true;
-   nir_options->lower_scmp = true;
-   nir_options->lower_fmod = true;
-   nir_options->lower_bitfield_extract = true;
-   nir_options->lower_bitfield_insert = true;
-   nir_options->lower_uadd_carry = true;
-   nir_options->lower_usub_borrow = true;
-
-   /* In the vec4 backend, our dpN instruction replicates its result to all
-    * the components of a vec4.  We would like NIR to give us replicated fdot
-    * instructions because it can optimize better for us.
-    *
-    * For the FS backend, it should be lowered away by the scalarizing pass so
-    * we should never see fdot anyway.
-    */
-   nir_options->fdot_replicates = true;
-
-   /* We want the GLSL compiler to emit code that uses condition codes */
-   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
-      compiler->glsl_compiler_options[i].MaxUnrollIterations = 32;
-      compiler->glsl_compiler_options[i].MaxIfDepth =
-         devinfo->gen < 6 ? 16 : UINT_MAX;
-
-      compiler->glsl_compiler_options[i].EmitCondCodes = true;
-      compiler->glsl_compiler_options[i].EmitNoNoise = true;
-      compiler->glsl_compiler_options[i].EmitNoMainReturn = true;
-      compiler->glsl_compiler_options[i].EmitNoIndirectInput = true;
-      compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
-      compiler->glsl_compiler_options[i].LowerClipDistance = true;
-
-      bool is_scalar = compiler->scalar_stage[i];
-
-      compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar;
-      compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar;
-      compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar;
-
-      /* !ARB_gpu_shader5 */
-      if (devinfo->gen < 7)
-         compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
-
-      compiler->glsl_compiler_options[i].NirOptions = nir_options;
-
-      compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
-   }
-
-   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
-   compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
-
-   if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
-      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
-
-   compiler->glsl_compiler_options[MESA_SHADER_COMPUTE]
-      .LowerShaderSharedVariables = true;
-
-   return compiler;
-}
 
 extern "C" struct gl_shader *
 brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 593361348fd..82374a46c18 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -259,9 +259,6 @@ struct brw_gs_compile
    unsigned control_data_header_size_bits;
 };
 
-struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);
-
 void
 brw_assign_common_binding_table_offsets(gl_shader_stage stage,
                                         const struct brw_device_info *devinfo,

From 84166aed927cfa4da8fbf7ece05f8b262192754c Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 21 Jan 2016 09:30:05 -0800
Subject: [PATCH 110/119] i965: Make separate nir_options for scalar/vector
 stages.

We'll want to have different lowering options set for scalar/vector
stages.
---
 src/mesa/drivers/dri/i965/brw_compiler.c | 61 +++++++++++++-----------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
index 6ca59c7392c..dbd5ad23cfd 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -67,6 +67,37 @@ shader_perf_log_mesa(void *data, const char *fmt, ...)
    va_end(args);
 }
 
+#define COMMON_OPTIONS                                                        \
+   /* In order to help allow for better CSE at the NIR level we tell NIR to   \
+    * split all ffma instructions during opt_algebraic and we then re-combine \
+    * them as a later step.                                                   \
+    */                                                                        \
+   .lower_ffma = true,                                                        \
+   .lower_sub = true,                                                         \
+   .lower_fdiv = true,                                                        \
+   .lower_scmp = true,                                                        \
+   .lower_fmod = true,                                                        \
+   .lower_bitfield_extract = true,                                            \
+   .lower_bitfield_insert = true,                                             \
+   .lower_uadd_carry = true,                                                  \
+   .lower_usub_borrow = true,                                                 \
+   .lower_fdiv = true,                                                        \
+   .native_integers = true
+
+static const struct nir_shader_compiler_options scalar_nir_options = {
+   COMMON_OPTIONS,
+};
+
+static const struct nir_shader_compiler_options vector_nir_options = {
+   COMMON_OPTIONS,
+
+   /* In the vec4 backend, our dpN instruction replicates its result to all the
+    * components of a vec4.  We would like NIR to give us replicated fdot
+    * instructions because it can optimize better for us.
+    */
+   .fdot_replicates = true,
+};
+
 struct brw_compiler *
 brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 {
@@ -89,33 +120,6 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
    compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
    compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;
 
-   nir_shader_compiler_options *nir_options =
-      rzalloc(compiler, nir_shader_compiler_options);
-   nir_options->native_integers = true;
-   nir_options->lower_fdiv = true;
-   /* In order to help allow for better CSE at the NIR level we tell NIR
-    * to split all ffma instructions during opt_algebraic and we then
-    * re-combine them as a later step.
-    */
-   nir_options->lower_ffma = true;
-   nir_options->lower_sub = true;
-   nir_options->lower_fdiv = true;
-   nir_options->lower_scmp = true;
-   nir_options->lower_fmod = true;
-   nir_options->lower_bitfield_extract = true;
-   nir_options->lower_bitfield_insert = true;
-   nir_options->lower_uadd_carry = true;
-   nir_options->lower_usub_borrow = true;
-
-   /* In the vec4 backend, our dpN instruction replicates its result to all
-    * the components of a vec4.  We would like NIR to give us replicated fdot
-    * instructions because it can optimize better for us.
-    *
-    * For the FS backend, it should be lowered away by the scalarizing pass so
-    * we should never see fdot anyway.
-    */
-   nir_options->fdot_replicates = true;
-
    /* We want the GLSL compiler to emit code that uses condition codes */
    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
       compiler->glsl_compiler_options[i].MaxUnrollIterations = 32;
@@ -139,7 +143,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
       if (devinfo->gen < 7)
          compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
 
-      compiler->glsl_compiler_options[i].NirOptions = nir_options;
+      compiler->glsl_compiler_options[i].NirOptions =
+         is_scalar ? &scalar_nir_options : &vector_nir_options;
 
       compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
    }

From 5eb1145434b3c1bd4bf7260dc3421e0159452d82 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 21 Jan 2016 15:46:47 -0800
Subject: [PATCH 111/119] nir: Add lowering of nir_op_unpack_half_2x16.

---
 src/glsl/nir/nir.h                     |  3 +++
 src/glsl/nir/nir_lower_alu_to_scalar.c | 30 ++++++++++++++++++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index e2bd2bfa025..11130304170 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1468,6 +1468,9 @@ typedef struct nir_shader_compiler_options {
    /** lowers ffract to fsub+ffloor: */
    bool lower_ffract;
 
+   bool lower_pack_half_2x16;
+   bool lower_unpack_half_2x16;
+
    /**
     * Does the driver support real 32-bit integers?  (Otherwise, integers
     * are simulated by floats.)
diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c
index 0a27e66cf0f..5372fbeed88 100644
--- a/src/glsl/nir/nir_lower_alu_to_scalar.c
+++ b/src/glsl/nir/nir_lower_alu_to_scalar.c
@@ -97,6 +97,20 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
        */
       return;
 
+   case nir_op_pack_half_2x16:
+      if (!b->shader->options->lower_pack_half_2x16)
+         return;
+
+      nir_ssa_def *val =
+         nir_pack_half_2x16_split(b, nir_channel(b, instr->src[0].src.ssa,
+                                                 instr->src[0].swizzle[0]),
+                                     nir_channel(b, instr->src[0].src.ssa,
+                                                 instr->src[0].swizzle[1]));
+
+      nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val));
+      nir_instr_remove(&instr->instr);
+      return;
+
    case nir_op_unpack_unorm_4x8:
    case nir_op_unpack_snorm_4x8:
    case nir_op_unpack_unorm_2x16:
@@ -106,11 +120,19 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
        */
       return;
 
-   case nir_op_unpack_half_2x16:
-      /* We could split this into unpack_half_2x16_split_[xy], but should
-       * we?
-       */
+   case nir_op_unpack_half_2x16: {
+      if (!b->shader->options->lower_unpack_half_2x16)
+         return;
+
+      nir_ssa_def *comps[2];
+      comps[0] = nir_unpack_half_2x16_split_x(b, instr->src[0].src.ssa);
+      comps[1] = nir_unpack_half_2x16_split_y(b, instr->src[0].src.ssa);
+      nir_ssa_def *vec = nir_vec(b, comps, 2);
+
+      nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(vec));
+      nir_instr_remove(&instr->instr);
       return;
+   }
 
    case nir_op_fdph: {
       nir_ssa_def *sum[4];

From 24d385f85ca040246d549c835905e868d55ee210 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 21 Jan 2016 15:30:57 -0800
Subject: [PATCH 112/119] i965/fs: Switch from GLSL IR to NIR for
 un/packHalf2x16 lowering.

---
 src/mesa/drivers/dri/i965/brw_compiler.c             |  2 ++
 .../drivers/dri/i965/brw_fs_channel_expressions.cpp  |  4 ++++
 src/mesa/drivers/dri/i965/brw_link.cpp               | 12 +-----------
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
index dbd5ad23cfd..21fff1ddf4f 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -86,6 +86,8 @@ shader_perf_log_mesa(void *data, const char *fmt, ...)
 
 static const struct nir_shader_compiler_options scalar_nir_options = {
    COMMON_OPTIONS,
+   .lower_pack_half_2x16 = true,
+   .lower_unpack_half_2x16 = true,
 };
 
 static const struct nir_shader_compiler_options vector_nir_options = {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 21f0b703d00..566257cf79a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -72,6 +72,9 @@ channel_expressions_predicate(ir_instruction *ir)
       return false;
 
    switch (expr->operation) {
+      case ir_unop_pack_half_2x16:
+         return false;
+
       /* these opcodes need to act on the whole vector,
        * just like texturing.
        */
@@ -162,6 +165,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
       return visit_continue;
 
    switch (expr->operation) {
+      case ir_unop_pack_half_2x16:
       case ir_unop_interpolate_at_centroid:
       case ir_binop_interpolate_at_offset:
       case ir_binop_interpolate_at_sample:
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index 234afd554df..8f2d7600146 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -87,17 +87,7 @@ brw_lower_packing_builtins(struct brw_context *brw,
            | LOWER_PACK_SNORM_4x8;
    }
 
-   if (brw->gen >= 7) {
-      /* Gen7 introduced the f32to16 and f16to32 instructions, which can be
-       * used to execute packHalf2x16 and unpackHalf2x16. For AOS code, no
-       * lowering is needed. For SOA code, the Half2x16 ops must be
-       * scalarized.
-       */
-      if (compiler->scalar_stage[shader_type]) {
-         ops |= LOWER_PACK_HALF_2x16_TO_SPLIT
-             |  LOWER_UNPACK_HALF_2x16_TO_SPLIT;
-      }
-   } else {
+   if (brw->gen < 7) {
       ops |= LOWER_PACK_HALF_2x16
           |  LOWER_UNPACK_HALF_2x16;
    }

From 26b2cc6f3a23d9e9047c9735d20e3fbcf2291ac2 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 21 Jan 2016 11:46:22 -0800
Subject: [PATCH 113/119] glsl: Remove 2x16 half-precision pack/unpack opcodes.

i965/fs was the only consumer, and we're now doing the lowering in NIR.
---
 src/glsl/ir.cpp                               |   9 --
 src/glsl/ir.h                                 |  19 ----
 src/glsl/ir_optimization.h                    |  15 +--
 src/glsl/ir_validate.cpp                      |  12 --
 src/glsl/lower_packing_builtins.cpp           | 105 +-----------------
 src/glsl/nir/glsl_to_nir.cpp                  |   9 --
 .../dri/i965/brw_fs_channel_expressions.cpp   |   3 -
 src/mesa/program/ir_to_mesa.cpp               |   3 -
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp    |   3 -
 9 files changed, 8 insertions(+), 170 deletions(-)

diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index b424edd8e96..db1947453ea 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -298,8 +298,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
       break;
 
    case ir_unop_noise:
-   case ir_unop_unpack_half_2x16_split_x:
-   case ir_unop_unpack_half_2x16_split_y:
       this->type = glsl_type::float_type;
       break;
 
@@ -422,10 +420,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1)
       this->type = op0->type->get_base_type();
       break;
 
-   case ir_binop_pack_half_2x16_split:
-      this->type = glsl_type::uint_type;
-      break;
-
    case ir_binop_imul_high:
    case ir_binop_carry:
    case ir_binop_borrow:
@@ -555,8 +549,6 @@ static const char *const operator_strs[] = {
    "unpackUnorm2x16",
    "unpackUnorm4x8",
    "unpackHalf2x16",
-   "unpackHalf2x16_split_x",
-   "unpackHalf2x16_split_y",
    "bitfield_reverse",
    "bit_count",
    "find_msb",
@@ -599,7 +591,6 @@ static const char *const operator_strs[] = {
    "min",
    "max",
    "pow",
-   "packHalf2x16_split",
    "ubo_load",
    "ldexp",
    "vector_extract",
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 5b845c6e856..b453187c32a 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -1401,16 +1401,6 @@ enum ir_expression_operation {
    ir_unop_unpack_half_2x16,
    /*@}*/
 
-   /**
-    * \name Lowered floating point unpacking operations.
-    *
-    * \see lower_packing_builtins_visitor::split_unpack_half_2x16
-    */
-   /*@{*/
-   ir_unop_unpack_half_2x16_split_x,
-   ir_unop_unpack_half_2x16_split_y,
-   /*@}*/
-
    /**
     * \name Bit operations, part of ARB_gpu_shader5.
     */
@@ -1541,15 +1531,6 @@ enum ir_expression_operation {
 
    ir_binop_pow,
 
-   /**
-    * \name Lowered floating point packing operations.
-    *
-    * \see lower_packing_builtins_visitor::split_pack_half_2x16
-    */
-   /*@{*/
-   ir_binop_pack_half_2x16_split,
-   /*@}*/
-
    /**
     * Load a value the size of a given GLSL type from a uniform block.
     *
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index be86f547f77..b56413a1500 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -58,17 +58,14 @@ enum lower_packing_builtins_op {
    LOWER_PACK_HALF_2x16                 = 0x0010,
    LOWER_UNPACK_HALF_2x16               = 0x0020,
 
-   LOWER_PACK_HALF_2x16_TO_SPLIT        = 0x0040,
-   LOWER_UNPACK_HALF_2x16_TO_SPLIT      = 0x0080,
+   LOWER_PACK_SNORM_4x8                 = 0x0040,
+   LOWER_UNPACK_SNORM_4x8               = 0x0080,
 
-   LOWER_PACK_SNORM_4x8                 = 0x0100,
-   LOWER_UNPACK_SNORM_4x8               = 0x0200,
+   LOWER_PACK_UNORM_4x8                 = 0x0100,
+   LOWER_UNPACK_UNORM_4x8               = 0x0200,
 
-   LOWER_PACK_UNORM_4x8                 = 0x0400,
-   LOWER_UNPACK_UNORM_4x8               = 0x0800,
-
-   LOWER_PACK_USE_BFI                   = 0x1000,
-   LOWER_PACK_USE_BFE                   = 0x2000,
+   LOWER_PACK_USE_BFI                   = 0x0400,
+   LOWER_PACK_USE_BFE                   = 0x0800,
 };
 
 bool do_common_optimization(exec_list *ir, bool linked,
diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp
index 94814799b9b..12928836597 100644
--- a/src/glsl/ir_validate.cpp
+++ b/src/glsl/ir_validate.cpp
@@ -372,12 +372,6 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->operands[0]->type == glsl_type::uint_type);
       break;
 
-   case ir_unop_unpack_half_2x16_split_x:
-   case ir_unop_unpack_half_2x16_split_y:
-      assert(ir->type == glsl_type::float_type);
-      assert(ir->operands[0]->type == glsl_type::uint_type);
-      break;
-
    case ir_unop_unpack_double_2x32:
       assert(ir->type == glsl_type::uvec2_type);
       assert(ir->operands[0]->type == glsl_type::double_type);
@@ -567,12 +561,6 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->operands[0]->type == ir->operands[1]->type);
       break;
 
-   case ir_binop_pack_half_2x16_split:
-      assert(ir->type == glsl_type::uint_type);
-      assert(ir->operands[0]->type == glsl_type::float_type);
-      assert(ir->operands[1]->type == glsl_type::float_type);
-      break;
-
    case ir_binop_ubo_load:
       assert(ir->operands[0]->type == glsl_type::uint_type);
 
diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp
index 7f18238bc6e..a41627bd561 100644
--- a/src/glsl/lower_packing_builtins.cpp
+++ b/src/glsl/lower_packing_builtins.cpp
@@ -43,13 +43,6 @@ public:
       : op_mask(op_mask),
         progress(false)
    {
-      /* Mutually exclusive options. */
-      assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
-               (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
-
-      assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
-               (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
-
       factory.instructions = &factory_instructions;
    }
 
@@ -96,9 +89,6 @@ public:
       case LOWER_PACK_HALF_2x16:
          *rvalue = lower_pack_half_2x16(op0);
          break;
-      case LOWER_PACK_HALF_2x16_TO_SPLIT:
-         *rvalue = split_pack_half_2x16(op0);
-         break;
       case LOWER_UNPACK_SNORM_2x16:
          *rvalue = lower_unpack_snorm_2x16(op0);
          break;
@@ -114,9 +104,6 @@ public:
       case LOWER_UNPACK_HALF_2x16:
          *rvalue = lower_unpack_half_2x16(op0);
          break;
-      case LOWER_UNPACK_HALF_2x16_TO_SPLIT:
-         *rvalue = split_unpack_half_2x16(op0);
-         break;
       case LOWER_PACK_UNPACK_NONE:
       case LOWER_PACK_USE_BFI:
       case LOWER_PACK_USE_BFE:
@@ -161,7 +148,7 @@ private:
          result = op_mask & LOWER_PACK_UNORM_4x8;
          break;
       case ir_unop_pack_half_2x16:
-         result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);
+         result = op_mask & LOWER_PACK_HALF_2x16;
          break;
       case ir_unop_unpack_snorm_2x16:
          result = op_mask & LOWER_UNPACK_SNORM_2x16;
@@ -176,7 +163,7 @@ private:
          result = op_mask & LOWER_UNPACK_UNORM_4x8;
          break;
       case ir_unop_unpack_half_2x16:
-         result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);
+         result = op_mask & LOWER_UNPACK_HALF_2x16;
          break;
       default:
          result = LOWER_PACK_UNPACK_NONE;
@@ -1092,41 +1079,6 @@ private:
       return result;
    }
 
-   /**
-    * \brief Split packHalf2x16's vec2 operand into two floats.
-    *
-    * \param vec2_rval is packHalf2x16's input
-    * \return a uint rvalue
-    *
-    * Some code generators, such as the i965 fragment shader, require that all
-    * vector expressions be lowered to a sequence of scalar expressions.
-    * However, packHalf2x16 cannot be scalarized by the same mechanism as
-    * a true vector operation because its input and output have a differing
-    * number of vector components.
-    *
-    * This method scalarizes packHalf2x16 by transforming it from an unary
-    * operation having vector input to a binary operation having scalar input.
-    * That is, it transforms
-    *
-    *    packHalf2x16(VEC2_RVAL);
-    *
-    * into
-    *
-    *    vec2 v = VEC2_RVAL;
-    *    return packHalf2x16_split(v.x, v.y);
-    */
-   ir_rvalue*
-   split_pack_half_2x16(ir_rvalue *vec2_rval)
-   {
-      assert(vec2_rval->type == glsl_type::vec2_type);
-
-      ir_variable *v = factory.make_temp(glsl_type::vec2_type,
-                                         "tmp_split_pack_half_2x16_v");
-      factory.emit(assign(v, vec2_rval));
-
-      return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v));
-   }
-
    /**
     * \brief Lower the component-wise calculation of unpackHalf2x16.
     *
@@ -1341,59 +1293,6 @@ private:
       assert(result->type == glsl_type::vec2_type);
       return result;
    }
-
-   /**
-    * \brief Split unpackHalf2x16 into two operations.
-    *
-    * \param uint_rval is unpackHalf2x16's input
-    * \return a vec2 rvalue
-    *
-    * Some code generators, such as the i965 fragment shader, require that all
-    * vector expressions be lowered to a sequence of scalar expressions.
-    * However, unpackHalf2x16 cannot be scalarized by the same method as
-    * a true vector operation because the number of components of its input
-    * and output differ.
-    *
-    * This method scalarizes unpackHalf2x16 by transforming it from a single
-    * operation having vec2 output to a pair of operations each having float
-    * output. That is, it transforms
-    *
-    *   unpackHalf2x16(UINT_RVAL)
-    *
-    * into
-    *
-    *   uint u = UINT_RVAL;
-    *   vec2 v;
-    *
-    *   v.x = unpackHalf2x16_split_x(u);
-    *   v.y = unpackHalf2x16_split_y(u);
-    *
-    *   return v;
-    */
-   ir_rvalue*
-   split_unpack_half_2x16(ir_rvalue *uint_rval)
-   {
-      assert(uint_rval->type == glsl_type::uint_type);
-
-      /* uint u = uint_rval; */
-      ir_variable *u = factory.make_temp(glsl_type::uint_type,
-                                          "tmp_split_unpack_half_2x16_u");
-      factory.emit(assign(u, uint_rval));
-
-      /* vec2 v; */
-      ir_variable *v = factory.make_temp(glsl_type::vec2_type,
-                                          "tmp_split_unpack_half_2x16_v");
-
-      /* v.x = unpack_half_2x16_split_x(u); */
-      factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u),
-                           WRITEMASK_X));
-
-      /* v.y = unpack_half_2x16_split_y(u); */
-      factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u),
-                           WRITEMASK_Y));
-
-      return deref(v).val;
-   }
 };
 
 } // namespace anonymous
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index c7399ebba0b..bec59c79fde 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -1429,12 +1429,6 @@ nir_visitor::visit(ir_expression *ir)
    case ir_unop_unpack_half_2x16:
       result = nir_unpack_half_2x16(&b, srcs[0]);
       break;
-   case ir_unop_unpack_half_2x16_split_x:
-      result = nir_unpack_half_2x16_split_x(&b, srcs[0]);
-      break;
-   case ir_unop_unpack_half_2x16_split_y:
-      result = nir_unpack_half_2x16_split_y(&b, srcs[0]);
-      break;
    case ir_unop_bitfield_reverse:
       result = nir_bitfield_reverse(&b, srcs[0]);
       break;
@@ -1718,9 +1712,6 @@ nir_visitor::visit(ir_expression *ir)
       }
       break;
 
-   case ir_binop_pack_half_2x16_split:
-         result = nir_pack_half_2x16_split(&b, srcs[0], srcs[1]);
-         break;
    case ir_binop_ldexp: result = nir_ldexp(&b, srcs[0], srcs[1]); break;
    case ir_triop_fma:
       result = nir_ffma(&b, srcs[0], srcs[1], srcs[2]);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 566257cf79a..b16dd2ffd9e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -403,9 +403,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_ssbo_unsized_array_length:
       unreachable("should have been lowered");
 
-   case ir_unop_unpack_half_2x16_split_x:
-   case ir_unop_unpack_half_2x16_split_y:
-   case ir_binop_pack_half_2x16_split:
    case ir_unop_interpolate_at_centroid:
    case ir_binop_interpolate_at_offset:
    case ir_binop_interpolate_at_sample:
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 9cde28dfc0a..6b2d431af07 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -1244,10 +1244,7 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
    case ir_unop_unpack_unorm_2x16:
    case ir_unop_unpack_unorm_4x8:
    case ir_unop_unpack_half_2x16:
-   case ir_unop_unpack_half_2x16_split_x:
-   case ir_unop_unpack_half_2x16_split_y:
    case ir_unop_unpack_double_2x32:
-   case ir_binop_pack_half_2x16_split:
    case ir_unop_bitfield_reverse:
    case ir_unop_bit_count:
    case ir_unop_find_msb:
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index d424e3b335f..a06683f31c8 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -2177,12 +2177,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
 
    case ir_unop_unpack_snorm_2x16:
    case ir_unop_unpack_unorm_2x16:
-   case ir_unop_unpack_half_2x16_split_x:
-   case ir_unop_unpack_half_2x16_split_y:
    case ir_unop_unpack_snorm_4x8:
    case ir_unop_unpack_unorm_4x8:
 
-   case ir_binop_pack_half_2x16_split:
    case ir_quadop_vector:
    case ir_binop_vector_extract:
    case ir_triop_vector_insert:

From 26f0444ead45bd9d5f7fbbca6292b284f382ecd6 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 21 Jan 2016 09:09:29 -0800
Subject: [PATCH 114/119] nir: Add opcodes to extract bytes or words.

The uint versions zero extend while the int versions sign extend.
---
 src/glsl/nir/nir.h                |  3 +++
 src/glsl/nir/nir_opcodes.py       |  9 +++++++++
 src/glsl/nir/nir_opt_algebraic.py | 16 ++++++++++++++++
 3 files changed, 28 insertions(+)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 11130304170..7b39cbb4c12 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1471,6 +1471,9 @@ typedef struct nir_shader_compiler_options {
    bool lower_pack_half_2x16;
    bool lower_unpack_half_2x16;
 
+   bool lower_extract_byte;
+   bool lower_extract_word;
+
    /**
     * Does the driver support real 32-bit integers?  (Otherwise, integers
     * are simulated by floats.)
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index a8bbe1a0b82..be3cd17193f 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -536,6 +536,15 @@ dst.x = src0.x;
 dst.y = src1.x;
 """)
 
+# Byte extraction
+binop("extract_ubyte", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
+binop("extract_ibyte", tint, "", "(int8_t)(src0 >> (src1 * 8))")
+
+# Word extraction
+binop("extract_uword", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
+binop("extract_iword", tint, "", "(int16_t)(src0 >> (src1 * 16))")
+
+
 def triop(name, ty, const_expr):
    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index 7745b76f7ce..b761b54cd70 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -242,6 +242,22 @@ optimizations = [
     ('bcsel', ('ult', 31, 'bits'), 'value',
               ('ubfe', 'value', 'offset', 'bits')),
     'options->lower_bitfield_extract'),
+
+   (('extract_ibyte', a, b),
+    ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 8),
+    'options->lower_extract_byte'),
+
+   (('extract_ubyte', a, b),
+    ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
+    'options->lower_extract_byte'),
+
+   (('extract_iword', a, b),
+    ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
+    'options->lower_extract_word'),
+
+   (('extract_uword', a, b),
+    ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
+    'options->lower_extract_word'),
 ]
 
 # Add optimizations to handle the case where the result of a ternary is

From 6c1b3bc950d480a21d4957b5b0cab84ffc49769b Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 20 Jan 2016 18:56:37 -0800
Subject: [PATCH 115/119] i965/fs: Implement support for extract_word.

The vec4 backend will lower it.
---
 src/mesa/drivers/dri/i965/brw_defines.h       | 12 ++++++++++
 src/mesa/drivers/dri/i965/brw_fs_cse.cpp      |  2 ++
 .../drivers/dri/i965/brw_fs_generator.cpp     | 22 +++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp      | 16 ++++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.cpp      |  4 ++++
 5 files changed, 56 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 10a6d39db85..01e0c99e440 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1085,6 +1085,18 @@ enum opcode {
     */
    SHADER_OPCODE_BROADCAST,
 
+   /**
+    * Pick the byte from its first source register given by the index
+    * specified as second source.
+    */
+   SHADER_OPCODE_EXTRACT_BYTE,
+
+   /**
+    * Pick the word from its first source register given by the index
+    * specified as second source.
+    */
+   SHADER_OPCODE_EXTRACT_WORD,
+
    VEC4_OPCODE_MOV_BYTES,
    VEC4_OPCODE_PACK_BYTES,
    VEC4_OPCODE_UNPACK_UNIFORM,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 3b65a382dc8..cde6566c05c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -78,6 +78,8 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
    case FS_OPCODE_LINTERP:
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
    case SHADER_OPCODE_BROADCAST:
+   case SHADER_OPCODE_EXTRACT_BYTE:
+   case SHADER_OPCODE_EXTRACT_WORD:
    case SHADER_OPCODE_MOV_INDIRECT:
       return true;
    case SHADER_OPCODE_RCP:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index e05622ae7a8..1916a995020 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -2201,6 +2201,28 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          brw_broadcast(p, dst, src[0], src[1]);
          break;
 
+      case SHADER_OPCODE_EXTRACT_BYTE: {
+         assert(src[0].type == BRW_REGISTER_TYPE_D ||
+                src[0].type == BRW_REGISTER_TYPE_UD);
+
+         enum brw_reg_type type =
+            src[0].type == BRW_REGISTER_TYPE_D ? BRW_REGISTER_TYPE_B
+                                               : BRW_REGISTER_TYPE_UB;
+         brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 4));
+         break;
+      }
+
+      case SHADER_OPCODE_EXTRACT_WORD: {
+         assert(src[0].type == BRW_REGISTER_TYPE_D ||
+                src[0].type == BRW_REGISTER_TYPE_UD);
+
+         enum brw_reg_type type =
+            src[0].type == BRW_REGISTER_TYPE_D ? BRW_REGISTER_TYPE_W
+                                               : BRW_REGISTER_TYPE_UW;
+         brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 2));
+         break;
+      }
+
       case FS_OPCODE_SET_SAMPLE_ID:
          generate_set_sample_id(inst, dst, src[0], src[1]);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index d7bcc1c5374..3efee50b273 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1075,6 +1075,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
 
+   case nir_op_extract_ubyte:
+   case nir_op_extract_ibyte: {
+      nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
+      bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
+               result, op[0], brw_imm_ud(byte->u[0]));
+      break;
+   }
+
+   case nir_op_extract_uword:
+   case nir_op_extract_iword: {
+      nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
+      bld.emit(SHADER_OPCODE_EXTRACT_WORD,
+               result, op[0], brw_imm_ud(word->u[0]));
+      break;
+   }
+
    default:
       unreachable("unhandled instruction");
    }
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index e42601b6f3e..6a6efa9aea2 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -312,6 +312,10 @@ brw_instruction_name(enum opcode op)
    case SHADER_OPCODE_BROADCAST:
       return "broadcast";
 
+   case SHADER_OPCODE_EXTRACT_BYTE:
+      return "extract_byte";
+   case SHADER_OPCODE_EXTRACT_WORD:
+      return "extract_word";
    case VEC4_OPCODE_MOV_BYTES:
       return "mov_bytes";
    case VEC4_OPCODE_PACK_BYTES:

From d7781038f51ee569aacc07bfd17b70dad318f043 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 25 Jan 2016 11:05:52 -0800
Subject: [PATCH 116/119] nir: Add lowering support for packing opcodes.

---
 src/glsl/nir/nir.h                     |  4 ++++
 src/glsl/nir/nir_lower_alu_to_scalar.c | 32 ++++++++++++++++++++++++++
 src/glsl/nir/nir_opcodes.py            | 10 ++++++++
 src/glsl/nir/nir_opt_algebraic.py      | 20 ++++++++++++++++
 4 files changed, 66 insertions(+)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 7b39cbb4c12..bbd5b1af64e 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1469,6 +1469,10 @@ typedef struct nir_shader_compiler_options {
    bool lower_ffract;
 
    bool lower_pack_half_2x16;
+   bool lower_pack_unorm_2x16;
+   bool lower_pack_snorm_2x16;
+   bool lower_pack_unorm_4x8;
+   bool lower_pack_snorm_4x8;
    bool lower_unpack_half_2x16;
 
    bool lower_extract_byte;
diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c
index 5372fbeed88..37cb0221e0b 100644
--- a/src/glsl/nir/nir_lower_alu_to_scalar.c
+++ b/src/glsl/nir/nir_lower_alu_to_scalar.c
@@ -134,6 +134,38 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
       return;
    }
 
+   case nir_op_pack_uvec2_to_uint: {
+      assert(b->shader->options->lower_pack_snorm_2x16 ||
+             b->shader->options->lower_pack_unorm_2x16);
+
+      nir_ssa_def *word =
+         nir_extract_uword(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
+      nir_ssa_def *val =
+         nir_ior(b, nir_ishl(b, nir_channel(b, word, 1), nir_imm_int(b, 16)),
+                                nir_channel(b, word, 0));
+
+      nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val));
+      nir_instr_remove(&instr->instr);
+      break;
+   }
+
+   case nir_op_pack_uvec4_to_uint: {
+      assert(b->shader->options->lower_pack_snorm_4x8 ||
+             b->shader->options->lower_pack_unorm_4x8);
+
+      nir_ssa_def *byte =
+         nir_extract_ubyte(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
+      nir_ssa_def *val =
+         nir_ior(b, nir_ior(b, nir_ishl(b, nir_channel(b, byte, 3), nir_imm_int(b, 24)),
+                               nir_ishl(b, nir_channel(b, byte, 2), nir_imm_int(b, 16))),
+                    nir_ior(b, nir_ishl(b, nir_channel(b, byte, 1), nir_imm_int(b, 8)),
+                               nir_channel(b, byte, 0)));
+
+      nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val));
+      nir_instr_remove(&instr->instr);
+      break;
+   }
+
    case nir_op_fdph: {
       nir_ssa_def *sum[4];
       for (unsigned i = 0; i < 3; i++) {
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index be3cd17193f..3b82c3ce1f9 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -237,6 +237,16 @@ unpack_2x16("unorm")
 unpack_4x8("unorm")
 unpack_2x16("half")
 
+unop_horiz("pack_uvec2_to_uint", 0, tuint, 2, tuint, """
+dst = (src0.x & 0xffff) | (src0.y >> 16);
+""")
+
+unop_horiz("pack_uvec4_to_uint", 0, tuint, 4, tuint, """
+dst = (src0.x <<  0) |
+      (src0.y <<  8) |
+      (src0.z << 16) |
+      (src0.w << 24);
+""")
 
 # Lowered floating point unpacking operations.
 
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index b761b54cd70..56b0f5ea7b2 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -258,6 +258,26 @@ optimizations = [
    (('extract_uword', a, b),
     ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
     'options->lower_extract_word'),
+
+    (('pack_unorm_2x16', 'v'),
+     ('pack_uvec2_to_uint',
+        ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
+     'options->lower_pack_unorm_2x16'),
+
+    (('pack_unorm_4x8', 'v'),
+     ('pack_uvec4_to_uint',
+        ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
+     'options->lower_pack_unorm_4x8'),
+
+    (('pack_snorm_2x16', 'v'),
+     ('pack_uvec2_to_uint',
+        ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
+     'options->lower_pack_snorm_2x16'),
+
+    (('pack_snorm_4x8', 'v'),
+     ('pack_uvec4_to_uint',
+        ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
+     'options->lower_pack_snorm_4x8'),
 ]
 
 # Add optimizations to handle the case where the result of a ternary is

From 8bb22dc3518b86ed2e0194c127f0438a0c073018 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 25 Jan 2016 11:07:02 -0800
Subject: [PATCH 117/119] nir: Add lowering support for unpacking opcodes.

---
 src/glsl/nir/nir.h                |  4 ++++
 src/glsl/nir/nir_opt_algebraic.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index bbd5b1af64e..3b90b5129f1 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1474,6 +1474,10 @@ typedef struct nir_shader_compiler_options {
    bool lower_pack_unorm_4x8;
    bool lower_pack_snorm_4x8;
    bool lower_unpack_half_2x16;
+   bool lower_unpack_unorm_2x16;
+   bool lower_unpack_snorm_2x16;
+   bool lower_unpack_unorm_4x8;
+   bool lower_unpack_snorm_4x8;
 
    bool lower_extract_byte;
    bool lower_extract_word;
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index 56b0f5ea7b2..a0d6c074682 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -278,6 +278,34 @@ optimizations = [
      ('pack_uvec4_to_uint',
         ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
      'options->lower_pack_snorm_4x8'),
+
+    (('unpack_unorm_2x16', 'v'),
+     ('fdiv', ('u2f', ('vec4', ('extract_uword', 'v', 0),
+                               ('extract_uword', 'v', 1), 0, 0)),
+              65535.0),
+     'options->lower_unpack_unorm_2x16'),
+
+    (('unpack_unorm_4x8', 'v'),
+     ('fdiv', ('u2f', ('vec4', ('extract_ubyte', 'v', 0),
+                               ('extract_ubyte', 'v', 1),
+                               ('extract_ubyte', 'v', 2),
+                               ('extract_ubyte', 'v', 3))),
+              255.0),
+     'options->lower_unpack_unorm_4x8'),
+
+    (('unpack_snorm_2x16', 'v'),
+     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_iword', 'v', 0),
+                                                            ('extract_iword', 'v', 1), 0, 0)),
+                                           32767.0))),
+     'options->lower_unpack_snorm_2x16'),
+
+    (('unpack_snorm_4x8', 'v'),
+     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_ibyte', 'v', 0),
+                                                            ('extract_ibyte', 'v', 1),
+                                                            ('extract_ibyte', 'v', 2),
+                                                            ('extract_ibyte', 'v', 3))),
+                                           127.0))),
+     'options->lower_unpack_snorm_4x8'),
 ]
 
 # Add optimizations to handle the case where the result of a ternary is

From 5deba3f00a9614e9ab60a40c737e87dc9f5a5a43 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 25 Jan 2016 10:49:15 -0800
Subject: [PATCH 118/119] i965/vec4: Implement nir_op_pack_uvec2_to_uint.

And mark nir_op_pack_uvec4_to_uint unreachable, since it's only produced
by lowering pack[SU]norm4x8 which the vec4 backend does not need.
---
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 1b87e3044c2..d3ac7ab61f7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1325,6 +1325,24 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_pack_unorm_2x16:
       unreachable("not reached: should be handled by lower_packing_builtins");
 
+   case nir_op_pack_uvec4_to_uint:
+      unreachable("not reached");
+
+   case nir_op_pack_uvec2_to_uint: {
+      dst_reg tmp1 = dst_reg(this, glsl_type::uint_type);
+      tmp1.writemask = WRITEMASK_X;
+      op[0].swizzle = BRW_SWIZZLE_YYYY;
+      emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u))));
+
+      dst_reg tmp2 = dst_reg(this, glsl_type::uint_type);
+      tmp2.writemask = WRITEMASK_X;
+      op[0].swizzle = BRW_SWIZZLE_XXXX;
+      emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu))));
+
+      emit(OR(dst, src_reg(tmp1), src_reg(tmp2)));
+      break;
+   }
+
    case nir_op_unpack_half_2x16:
       /* As NIR does not guarantee that we have a correct swizzle outside the
        * boundaries of a vector, and the implementation of emit_unpack_half_2x16

From 874ede498334c9ec39383be9b24c2c368dcb349e Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 25 Jan 2016 11:07:28 -0800
Subject: [PATCH 119/119] i965/gen7+: Use NIR for lowering of pack/unpack
 opcodes.

---
 src/mesa/drivers/dri/i965/brw_compiler.c      | 15 +++++++++++
 .../dri/i965/brw_fs_channel_expressions.cpp   |  8 ++++++
 src/mesa/drivers/dri/i965/brw_link.cpp        | 25 +++++--------------
 3 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
index 21fff1ddf4f..f9e22d1d6b5 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -87,7 +87,15 @@ shader_perf_log_mesa(void *data, const char *fmt, ...)
 static const struct nir_shader_compiler_options scalar_nir_options = {
    COMMON_OPTIONS,
    .lower_pack_half_2x16 = true,
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_snorm_4x8 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_pack_unorm_4x8 = true,
    .lower_unpack_half_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_snorm_4x8 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_unpack_unorm_4x8 = true,
 };
 
 static const struct nir_shader_compiler_options vector_nir_options = {
@@ -98,6 +106,13 @@ static const struct nir_shader_compiler_options vector_nir_options = {
     * instructions because it can optimize better for us.
     */
    .fdot_replicates = true,
+
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
 };
 
 struct brw_compiler *
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index b16dd2ffd9e..cbad47ee40a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -73,6 +73,10 @@ channel_expressions_predicate(ir_instruction *ir)
 
    switch (expr->operation) {
       case ir_unop_pack_half_2x16:
+      case ir_unop_pack_snorm_2x16:
+      case ir_unop_pack_snorm_4x8:
+      case ir_unop_pack_unorm_2x16:
+      case ir_unop_pack_unorm_4x8:
          return false;
 
       /* these opcodes need to act on the whole vector,
@@ -166,6 +170,10 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
 
    switch (expr->operation) {
       case ir_unop_pack_half_2x16:
+      case ir_unop_pack_snorm_2x16:
+      case ir_unop_pack_snorm_4x8:
+      case ir_unop_pack_unorm_2x16:
+      case ir_unop_pack_unorm_4x8:
       case ir_unop_interpolate_at_centroid:
       case ir_binop_interpolate_at_offset:
       case ir_binop_interpolate_at_sample:
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index 8f2d7600146..ab9d7929c05 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -73,26 +73,13 @@ brw_lower_packing_builtins(struct brw_context *brw,
                            gl_shader_stage shader_type,
                            exec_list *ir)
 {
-   const struct brw_compiler *compiler = brw->intelScreen->compiler;
+   /* Gens < 7 don't have instructions to convert to or from half-precision,
+    * and Gens < 6 don't expose that functionality.
+    */
+   if (brw->gen != 6)
+      return;
 
-   int ops = LOWER_PACK_SNORM_2x16
-           | LOWER_UNPACK_SNORM_2x16
-           | LOWER_PACK_UNORM_2x16
-           | LOWER_UNPACK_UNORM_2x16;
-
-   if (compiler->scalar_stage[shader_type]) {
-      ops |= LOWER_UNPACK_UNORM_4x8
-           | LOWER_UNPACK_SNORM_4x8
-           | LOWER_PACK_UNORM_4x8
-           | LOWER_PACK_SNORM_4x8;
-   }
-
-   if (brw->gen < 7) {
-      ops |= LOWER_PACK_HALF_2x16
-          |  LOWER_UNPACK_HALF_2x16;
-   }
-
-   lower_packing_builtins(ir, ops);
+   lower_packing_builtins(ir, LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16);
 }
 
 static void