radeonsi: switch to 3-spaces style
Generated automatically using clang-format and the following config:

AlignAfterOpenBracket: true
AlignConsecutiveMacros: true
AllowAllArgumentsOnNextLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: false
AlwaysBreakAfterReturnType: None
BasedOnStyle: LLVM
BraceWrapping:
  AfterControlStatement: false
  AfterEnum: true
  AfterFunction: true
  AfterStruct: false
  BeforeElse: false
  SplitEmptyFunction: true
BinPackArguments: true
BinPackParameters: true
BreakBeforeBraces: Custom
ColumnLimit: 100
ContinuationIndentWidth: 3
Cpp11BracedListStyle: false
ForEachMacros:
  - LIST_FOR_EACH_ENTRY
  - LIST_FOR_EACH_ENTRY_SAFE
  - util_dynarray_foreach
  - nir_foreach_variable
  - nir_foreach_variable_safe
  - nir_foreach_register
  - nir_foreach_register_safe
  - nir_foreach_use
  - nir_foreach_use_safe
  - nir_foreach_if_use
  - nir_foreach_if_use_safe
  - nir_foreach_def
  - nir_foreach_def_safe
  - nir_foreach_phi_src
  - nir_foreach_phi_src_safe
  - nir_foreach_parallel_copy_entry
  - nir_foreach_instr
  - nir_foreach_instr_reverse
  - nir_foreach_instr_safe
  - nir_foreach_instr_reverse_safe
  - nir_foreach_function
  - nir_foreach_block
  - nir_foreach_block_safe
  - nir_foreach_block_reverse
  - nir_foreach_block_reverse_safe
  - nir_foreach_block_in_cf_node
IncludeBlocks: Regroup
IncludeCategories:
  - Regex: '<[[:alnum:].]+>'
    Priority: 2
  - Regex: '.*'
    Priority: 1
IndentWidth: 3
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyExcessCharacter: 100
SpaceAfterCStyleCast: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: false
SpacesInContainerLiterals: false
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4319>
parent 53e5e802f8
commit d7008fe46a
52 changed files with 37663 additions and 41424 deletions
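For illustration, a minimal hypothetical helper formatted to the configuration above (3-space indentation, 100-column limit, opening brace on its own line after a function definition but not after a control statement). The function example_set_reg is not part of this commit; it only reuses helpers that appear in the diff below:

/* Hypothetical example, formatted per the .clang-format settings listed above. */
static void example_set_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   /* 3-space indent; the brace stays on the control-statement line. */
   if (reg >= SI_CONTEXT_REG_OFFSET)
      radeon_set_context_reg(cs, reg, value);
   else
      radeon_set_config_reg(cs, reg, value);
}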
@@ -1,3 +0,0 @@
[*.{c,h}]
indent_style = tab
indent_size = tab
@@ -1,18 +1,18 @@
// DriConf options specific to radeonsi
DRI_CONF_SECTION_PERFORMANCE
   DRI_CONF_ADAPTIVE_SYNC("true")
   DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
   DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
   DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
DRI_CONF_SECTION_END

DRI_CONF_SECTION_DEBUG

//= BEGIN VERBATIM
#define OPT_BOOL(name, dflt, description)                                                          \
   DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt)                                                    \
   DRI_CONF_DESC(en, description)                                                                  \
   DRI_CONF_OPT_END

#include "radeonsi/si_debug_options.h"
//= END VERBATIM
@@ -22,13 +22,13 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
@@ -39,12 +39,12 @@
 * without additional GPU cost.
 */
struct gfx10_sh_query_buffer {
   struct list_head list;
   struct si_resource *buf;
   unsigned refcount;

   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
   unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
@@ -55,469 +55,454 @@ struct gfx10_sh_query_buffer {
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
   struct {
      uint64_t generated_primitives_start_dummy;
      uint64_t emitted_primitives_start_dummy;
      uint64_t generated_primitives;
      uint64_t emitted_primitives;
   } stream[4];
   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
   uint32_t pad[31];
};

/* Shader-based queries. */
struct gfx10_sh_query {
   struct si_query b;

   struct gfx10_sh_query_buffer *first;
   struct gfx10_sh_query_buffer *last;
   unsigned first_begin;
   unsigned last_end;

   unsigned stream;
};

static void emit_shader_query(struct si_context *sctx)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
                                            PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
   assert(results);

   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries > 0) {
      gfx10_alloc_query_buffer(sctx);
   } else {
      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[query->stream].emitted_primitives !=
                      qmem->stream[query->stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
      unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               bool wait, enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = sizeof(uint32_t) * (4 + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

      if (wait) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (qbuf == query->last)
         break;
      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}
@@ -34,131 +34,128 @@

static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
   assert(reg < SI_CONTEXT_REG_OFFSET);
   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
   radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
}

static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   radeon_set_config_reg_seq(cs, reg, 1);
   radeon_emit(cs, value);
}

static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
   assert(reg >= SI_CONTEXT_REG_OFFSET);
   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
   radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
}

static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   radeon_set_context_reg_seq(cs, reg, 1);
   radeon_emit(cs, value);
}

static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, unsigned reg, unsigned idx,
                                              unsigned value)
{
   assert(reg >= SI_CONTEXT_REG_OFFSET);
   assert(cs->current.cdw + 3 <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0));
   radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28));
   radeon_emit(cs, value);
}

static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
   assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
   radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
}

static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   radeon_set_sh_reg_seq(cs, reg, 1);
   radeon_emit(cs, value);
}

static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
   assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
   radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
}

static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   radeon_set_uconfig_reg_seq(cs, reg, 1);
   radeon_emit(cs, value);
}

static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, struct si_screen *screen,
                                              unsigned reg, unsigned idx, unsigned value)
{
   assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
   assert(cs->current.cdw + 3 <= cs->current.max_dw);
   assert(idx != 0);
   unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
   if (screen->info.chip_class < GFX9 ||
       (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26))
      opcode = PKT3_SET_UCONFIG_REG;
   radeon_emit(cs, PKT3(opcode, 1, 0));
   radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
   radeon_emit(cs, value);
}

static inline void radeon_set_context_reg_rmw(struct radeon_cmdbuf *cs, unsigned reg,
                                              unsigned value, unsigned mask)
{
   assert(reg >= SI_CONTEXT_REG_OFFSET);
   assert(cs->current.cdw + 4 <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0));
   radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
   radeon_emit(cs, mask);
   radeon_emit(cs, value);
}

/* Emit PKT3_CONTEXT_REG_RMW if the register value is different. */
static inline void radeon_opt_set_context_reg_rmw(struct si_context *sctx, unsigned offset,
                                                  enum si_tracked_reg reg, unsigned value,
                                                  unsigned mask)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   assert((value & ~mask) == 0);
   value &= mask;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
       sctx->tracked_regs.reg_value[reg] != value) {
      radeon_set_context_reg_rmw(cs, offset, value, mask);

      sctx->tracked_regs.reg_saved |= 0x1ull << reg;
      sctx->tracked_regs.reg_value[reg] = value;
   }
}

/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */
static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
                                              enum si_tracked_reg reg, unsigned value)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
       sctx->tracked_regs.reg_value[reg] != value) {
      radeon_set_context_reg(cs, offset, value);

      sctx->tracked_regs.reg_saved |= 0x1ull << reg;
      sctx->tracked_regs.reg_value[reg] = value;
   }
}

/**
@@ -168,98 +165,96 @@ static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned
 * @param value2 is written to second register
 */
static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,
                                               enum si_tracked_reg reg, unsigned value1,
                                               unsigned value2)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 ||
       sctx->tracked_regs.reg_value[reg] != value1 ||
       sctx->tracked_regs.reg_value[reg + 1] != value2) {
      radeon_set_context_reg_seq(cs, offset, 2);
      radeon_emit(cs, value1);
      radeon_emit(cs, value2);

      sctx->tracked_regs.reg_value[reg] = value1;
      sctx->tracked_regs.reg_value[reg + 1] = value2;
      sctx->tracked_regs.reg_saved |= 0x3ull << reg;
   }
}

/**
 * Set 3 consecutive registers if any registers value is different.
 */
static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset,
                                               enum si_tracked_reg reg, unsigned value1,
                                               unsigned value2, unsigned value3)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 ||
       sctx->tracked_regs.reg_value[reg] != value1 ||
       sctx->tracked_regs.reg_value[reg + 1] != value2 ||
       sctx->tracked_regs.reg_value[reg + 2] != value3) {
      radeon_set_context_reg_seq(cs, offset, 3);
      radeon_emit(cs, value1);
      radeon_emit(cs, value2);
      radeon_emit(cs, value3);

      sctx->tracked_regs.reg_value[reg] = value1;
      sctx->tracked_regs.reg_value[reg + 1] = value2;
      sctx->tracked_regs.reg_value[reg + 2] = value3;
      sctx->tracked_regs.reg_saved |= 0x7ull << reg;
   }
}

/**
 * Set 4 consecutive registers if any registers value is different.
 */
static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset,
                                               enum si_tracked_reg reg, unsigned value1,
                                               unsigned value2, unsigned value3, unsigned value4)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf ||
       sctx->tracked_regs.reg_value[reg] != value1 ||
       sctx->tracked_regs.reg_value[reg + 1] != value2 ||
       sctx->tracked_regs.reg_value[reg + 2] != value3 ||
       sctx->tracked_regs.reg_value[reg + 3] != value4) {
      radeon_set_context_reg_seq(cs, offset, 4);
      radeon_emit(cs, value1);
      radeon_emit(cs, value2);
      radeon_emit(cs, value3);
      radeon_emit(cs, value4);

      sctx->tracked_regs.reg_value[reg] = value1;
      sctx->tracked_regs.reg_value[reg + 1] = value2;
      sctx->tracked_regs.reg_value[reg + 2] = value3;
      sctx->tracked_regs.reg_value[reg + 3] = value4;
      sctx->tracked_regs.reg_saved |= 0xfull << reg;
   }
}

/**
 * Set consecutive registers if any registers value is different.
 */
static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset,
                                               unsigned *value, unsigned *saved_val, unsigned num)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   int i, j;

   for (i = 0; i < num; i++) {
      if (saved_val[i] != value[i]) {
         radeon_set_context_reg_seq(cs, offset, num);
         for (j = 0; j < num; j++)
            radeon_emit(cs, value[j]);

         memcpy(saved_val, value, sizeof(uint32_t) * num);
         break;
      }
   }
}

#endif
@@ -25,35 +25,33 @@
#ifndef SI_COMPUTE_H
#define SI_COMPUTE_H

#include "si_shader.h"
#include "util/u_inlines.h"

struct si_compute {
   struct si_shader_selector sel;
   struct si_shader shader;

   unsigned ir_type;
   unsigned local_size;
   unsigned private_size;
   unsigned input_size;

   int max_global_buffers;
   struct pipe_resource **global_buffers;

   bool reads_variable_block_size;
   unsigned num_cs_user_data_dwords;
};

void si_destroy_compute(struct si_compute *program);

static inline void si_compute_reference(struct si_compute **dst, struct si_compute *src)
{
   if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference))
      si_destroy_compute(*dst);

   *dst = src;
}

#endif /* SI_COMPUTE_H */
@@ -1,9 +1,11 @@
OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)")
OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps")
OPT_BOOL(debug_disassembly, false,
         "Report shader disassembly as part of driver debug messages (for shader db)")
OPT_BOOL(halt_shaders, false, "Halt shaders at the start (will hang)")
OPT_BOOL(vs_fetch_always_opencode, false,
         "Always open code vertex fetches (less efficient, purely for testing)")
OPT_BOOL(prim_restart_tri_strips_only, false, "Only enable primitive restart for triangle strips")

#undef OPT_BOOL
@ -27,304 +27,279 @@
|
|||
|
||||
static void si_dma_emit_wait_idle(struct si_context *sctx)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
||||
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
||||
|
||||
/* NOP waits for idle. */
|
||||
if (sctx->chip_class >= GFX7)
|
||||
radeon_emit(cs, 0x00000000); /* NOP */
|
||||
else
|
||||
radeon_emit(cs, 0xf0000000); /* NOP */
|
||||
/* NOP waits for idle. */
|
||||
if (sctx->chip_class >= GFX7)
|
||||
radeon_emit(cs, 0x00000000); /* NOP */
|
||||
else
|
||||
radeon_emit(cs, 0xf0000000); /* NOP */
|
||||
}
|
||||
|
||||
void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst,
|
||||
uint64_t offset)
|
||||
void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
||||
uint64_t va = dst->gpu_address + offset;
|
||||
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
||||
uint64_t va = dst->gpu_address + offset;
|
||||
|
||||
if (sctx->chip_class == GFX6) {
|
||||
unreachable("SI DMA doesn't support the timestamp packet.");
|
||||
return;
|
||||
}
|
||||
if (sctx->chip_class == GFX6) {
|
||||
unreachable("SI DMA doesn't support the timestamp packet.");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Mark the buffer range of destination as valid (initialized),
|
||||
* so that transfer_map knows it should wait for the GPU when mapping
|
||||
* that range. */
|
||||
util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
|
||||
/* Mark the buffer range of destination as valid (initialized),
|
||||
* so that transfer_map knows it should wait for the GPU when mapping
|
||||
* that range. */
|
||||
util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
|
||||
|
||||
assert(va % 8 == 0);
|
||||
assert(va % 8 == 0);
|
||||
|
||||
si_need_dma_space(sctx, 4, dst, NULL);
|
||||
si_dma_emit_wait_idle(sctx);
|
||||
si_need_dma_space(sctx, 4, dst, NULL);
|
||||
si_dma_emit_wait_idle(sctx);
|
||||
|
||||
radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
|
||||
SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
|
||||
0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(
|
||||
cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
}
|
||||
|
||||
void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
                          uint64_t size, unsigned clear_value)
{
   struct radeon_cmdbuf *cs = sctx->sdma_cs;
   unsigned i, ncopy, csize;
   struct si_resource *sdst = si_resource(dst);

   assert(offset % 4 == 0);
   assert(size);
   assert(size % 4 == 0);

   if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
       sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) {
      sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
      return;
   }

   /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
   util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);

   offset += sdst->gpu_address;

   if (sctx->chip_class == GFX6) {
      /* the same maximum size as for copying */
      ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
      si_need_dma_space(sctx, ncopy * 4, sdst, NULL);

      for (i = 0; i < ncopy; i++) {
         csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4));
         radeon_emit(cs, offset);
         radeon_emit(cs, clear_value);
         radeon_emit(cs, (offset >> 32) << 16);
         offset += csize;
         size -= csize;
      }
      return;
   }

   /* The following code is for Sea Islands and later. */
   /* the same maximum size as for copying */
   ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
   si_need_dma_space(sctx, ncopy * 5, sdst, NULL);

   for (i = 0; i < ncopy; i++) {
      csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */));
      radeon_emit(cs, offset);
      radeon_emit(cs, offset >> 32);
      radeon_emit(cs, clear_value);
      /* dw count */
      radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
      offset += csize;
      size -= csize;
   }
}

void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
                         struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
                         uint64_t size)
{
   struct radeon_cmdbuf *cs = sctx->sdma_cs;
   unsigned i, ncopy, csize;
   struct si_resource *sdst = si_resource(dst);
   struct si_resource *ssrc = si_resource(src);

   if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE) {
      si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
      return;
   }

   /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
   util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size);

   dst_offset += sdst->gpu_address;
   src_offset += ssrc->gpu_address;

   if (sctx->chip_class == GFX6) {
      unsigned max_size, sub_cmd, shift;

      /* see whether we should use the dword-aligned or byte-aligned copy */
      if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
         sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
         shift = 2;
         max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
      } else {
         sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
         shift = 0;
         max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
      }

      ncopy = DIV_ROUND_UP(size, max_size);
      si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);

      for (i = 0; i < ncopy; i++) {
         csize = MIN2(size, max_size);
         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift));
         radeon_emit(cs, dst_offset);
         radeon_emit(cs, src_offset);
         radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
         radeon_emit(cs, (src_offset >> 32UL) & 0xff);
         dst_offset += csize;
         src_offset += csize;
         size -= csize;
      }
      return;
   }

   /* The following code is for CI and later. */
   unsigned align = ~0u;
   ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);

   /* Align copy size to dw if src/dst address are dw aligned */
   if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) {
      align = ~0x3u;
      ncopy++;
   }

   si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);

   for (i = 0; i < ncopy; i++) {
      csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size;
      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0));
      radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
      radeon_emit(cs, 0); /* src/dst endian swap */
      radeon_emit(cs, src_offset);
      radeon_emit(cs, src_offset >> 32);
      radeon_emit(cs, dst_offset);
      radeon_emit(cs, dst_offset >> 32);
      dst_offset += csize;
      src_offset += csize;
      size -= csize;
   }
}

void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
                       struct si_resource *src)
{
   struct radeon_winsys *ws = ctx->ws;
   uint64_t vram = ctx->sdma_cs->used_vram;
   uint64_t gtt = ctx->sdma_cs->used_gart;

   if (dst) {
      vram += dst->vram_usage;
      gtt += dst->gart_usage;
   }
   if (src) {
      vram += src->vram_usage;
      gtt += src->gart_usage;
   }

   /* Flush the GFX IB if DMA depends on it. */
   if (!ctx->sdma_uploads_in_progress && radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
       ((dst && ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
        (src && ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE))))
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

   /* Flush if there's not enough space, or if the memory usage per IB
    * is too large.
    *
    * IBs using too little memory are limited by the IB submission overhead.
    * IBs using too much memory are limited by the kernel/TTM overhead.
    * Too long IBs create CPU-GPU pipeline bubbles and add latency.
    *
    * This heuristic makes sure that DMA requests are executed
    * very soon after the call is made and lowers memory usage.
    * It improves texture upload performance by keeping the DMA
    * engine busy while uploads are being submitted.
    */
   num_dw++; /* for emit_wait_idle below */
   if (!ctx->sdma_uploads_in_progress &&
       (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) ||
        ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 ||
        !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) {
      si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
      assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw);
   }

   /* Wait for idle if either buffer has been used in the IB before to
    * prevent read-after-write hazards.
    */
   if ((dst && ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
       (src && ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE)))
      si_dma_emit_wait_idle(ctx);

   unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
   if (dst) {
      ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0);
   }
   if (src) {
      ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0);
   }

   /* this function is called before all DMA calls, so increment this. */
   ctx->num_dma_calls++;
}

void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
{
   struct radeon_cmdbuf *cs = ctx->sdma_cs;
   struct radeon_saved_cs saved;
   bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;

   if (!radeon_emitted(cs, 0)) {
      if (fence)
         ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
      return;
   }

   if (check_vm)
      si_save_cs(ctx->ws, cs, &saved, true);

   ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
   if (fence)
      ctx->ws->fence_reference(fence, ctx->last_sdma_fence);

   if (check_vm) {
      /* Use conservative timeout 800ms, after which we won't wait any
       * longer and assume the GPU is hung.
       */
      ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000);

      si_check_vm_faults(ctx, &saved, RING_DMA);
      si_clear_saved_cs(&saved);
   }
}

void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
                            uint64_t size, unsigned value)
{
   struct si_context *ctx = (struct si_context *)sscreen->aux_context;

   simple_mtx_lock(&sscreen->aux_context_lock);
   si_sdma_clear_buffer(ctx, dst, offset, size, value);
   sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
   simple_mtx_unlock(&sscreen->aux_context_lock);
}

File diff suppressed because it is too large
Load diff

@@ -23,516 +23,499 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_build_pm4.h"
#include "si_pipe.h"
#include "sid.h"

#include "util/os_time.h"
#include "util/u_upload_mgr.h"

/* initialize */
void si_need_gfx_cs_space(struct si_context *ctx)
{
   struct radeon_cmdbuf *cs = ctx->gfx_cs;

   /* There is no need to flush the DMA IB here, because
    * si_need_dma_space always flushes the GFX IB if there is
    * a conflict, which means any unflushed DMA commands automatically
    * precede the GFX IB (= they had no dependency on the GFX IB when
    * they were submitted).
    */

   /* There are two memory usage counters in the winsys for all buffers
    * that have been added (cs_add_buffer) and two counters in the pipe
    * driver for those that haven't been added yet.
    */
   if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs, ctx->vram, ctx->gtt))) {
      ctx->gtt = 0;
      ctx->vram = 0;
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      return;
   }
   ctx->gtt = 0;
   ctx->vram = 0;

   unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
   if (!ctx->ws->cs_check_space(cs, need_dwords, false))
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}

void si_unref_sdma_uploads(struct si_context *sctx)
{
   for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
      si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
      si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
   }
   sctx->num_sdma_uploads = 0;
}

void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
{
   struct radeon_cmdbuf *cs = ctx->gfx_cs;
   struct radeon_winsys *ws = ctx->ws;
   const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
   unsigned wait_flags = 0;

   if (ctx->gfx_flush_in_progress)
      return;

   if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
      wait_flags |= wait_ps_cs | SI_CONTEXT_INV_L2;
   } else if (ctx->chip_class == GFX6) {
      /* The kernel flushes L2 before shaders are finished. */
      wait_flags |= wait_ps_cs;
   } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
      wait_flags |= wait_ps_cs;
   }

   /* Drop this flush if it's a no-op. */
   if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && (!wait_flags || !ctx->gfx_last_ib_is_busy))
      return;

   if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET)
      return;

   if (ctx->screen->debug_flags & DBG(CHECK_VM))
      flags &= ~PIPE_FLUSH_ASYNC;

   ctx->gfx_flush_in_progress = true;

   /* If the state tracker is flushing the GFX IB, si_flush_from_st is
    * responsible for flushing the DMA IB and merging the fences from both.
    * If the driver flushes the GFX IB internally, and it should never ask
    * for a fence handle.
    */
   assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL);

   /* Update the sdma_uploads list by flushing the uploader. */
   u_upload_unmap(ctx->b.const_uploader);

   /* Execute SDMA uploads. */
   ctx->sdma_uploads_in_progress = true;
   for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
      struct si_sdma_upload *up = &ctx->sdma_uploads[i];

      assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && up->size % 4 == 0);

      si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, up->dst_offset, up->src_offset,
                          up->size);
   }
   ctx->sdma_uploads_in_progress = false;
   si_unref_sdma_uploads(ctx);

   /* Flush SDMA (preamble IB). */
   if (radeon_emitted(ctx->sdma_cs, 0))
      si_flush_dma_cs(ctx, flags, NULL);

   if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) {
      struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs;
      si_compute_signal_gfx(ctx);

      /* Make sure compute shaders are idle before leaving the IB, so that
       * the next IB doesn't overwrite GDS that might be in use. */
      radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

      /* Save the GDS prim restart counter if needed. */
      if (ctx->preserve_prim_restart_gds_at_flush) {
         si_cp_copy_data(ctx, compute_cs, COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4,
                         COPY_DATA_GDS, NULL, 4);
      }
   }

   if (ctx->has_graphics) {
      if (!list_is_empty(&ctx->active_queries))
         si_suspend_queries(ctx);

      ctx->streamout.suspended = false;
      if (ctx->streamout.begin_emitted) {
         si_emit_streamout_end(ctx);
         ctx->streamout.suspended = true;

         /* Since NGG streamout uses GDS, we need to make GDS
          * idle when we leave the IB, otherwise another process
          * might overwrite it while our shaders are busy.
          */
         if (ctx->screen->use_ngg_streamout)
            wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
      }
   }

   /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
    * because the kernel doesn't wait for it. */
   if (ctx->chip_class >= GFX7)
      si_cp_dma_wait_for_idle(ctx);

   /* Wait for draw calls to finish if needed. */
   if (wait_flags) {
      ctx->flags |= wait_flags;
      ctx->emit_cache_flush(ctx);
   }
   ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;

   if (ctx->current_saved_cs) {
      si_trace_emit(ctx);

      /* Save the IB for debug contexts. */
      si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
      ctx->current_saved_cs->flushed = true;
      ctx->current_saved_cs->time_flush = os_time_get_nano();

      si_log_hw_flush(ctx);
   }

   if (si_compute_prim_discard_enabled(ctx)) {
      /* The compute IB can start after the previous gfx IB starts. */
      if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) {
         ctx->ws->cs_add_fence_dependency(
            ctx->gfx_cs, ctx->last_gfx_fence,
            RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE);
      }

      /* Remember the last execution barrier. It's in the IB.
       * It will signal the start of the next compute IB.
       */
      if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) {
         *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
         ctx->last_pkt3_write_data = NULL;

         si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
         ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
         si_resource_reference(&ctx->barrier_buf, NULL);

         ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
      }
   }

   /* Flush the CS. */
   ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
   if (fence)
      ws->fence_reference(fence, ctx->last_gfx_fence);

   ctx->num_gfx_cs_flushes++;

   if (si_compute_prim_discard_enabled(ctx)) {
      /* Remember the last execution barrier, which is the last fence
       * in this case.
       */
      if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
         ctx->last_pkt3_write_data = NULL;
         si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
         ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
      }
   }

   /* Check VM faults if needed. */
   if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
      /* Use conservative timeout 800ms, after which we won't wait any
       * longer and assume the GPU is hung.
       */
      ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800 * 1000 * 1000);

      si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
   }

   if (ctx->current_saved_cs)
      si_saved_cs_reference(&ctx->current_saved_cs, NULL);

   si_begin_new_gfx_cs(ctx);
   ctx->gfx_flush_in_progress = false;
}

static void si_begin_gfx_cs_debug(struct si_context *ctx)
{
   static const uint32_t zeros[1];
   assert(!ctx->current_saved_cs);

   ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
   if (!ctx->current_saved_cs)
      return;

   pipe_reference_init(&ctx->current_saved_cs->reference, 1);

   ctx->current_saved_cs->trace_buf =
      si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
   if (!ctx->current_saved_cs->trace_buf) {
      free(ctx->current_saved_cs);
      ctx->current_saved_cs = NULL;
      return;
   }

   pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, 0, sizeof(zeros),
                               zeros);
   ctx->current_saved_cs->trace_id = 0;

   si_trace_emit(ctx);

   radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
                             RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
}

static void si_add_gds_to_buffer_list(struct si_context *sctx)
{
   if (sctx->gds) {
      sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
      if (sctx->gds_oa) {
         sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
      }
   }
}

void si_allocate_gds(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   if (sctx->gds)
      return;

   assert(sctx->screen->use_ngg_streamout);

   /* 4 streamout GDS counters.
    * We need 256B (64 dw) of GDS, otherwise streamout hangs.
    */
   sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0);
   sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0);

   assert(sctx->gds && sctx->gds_oa);
   si_add_gds_to_buffer_list(sctx);
}

void si_begin_new_gfx_cs(struct si_context *ctx)
|
||||
{
|
||||
if (ctx->is_debug)
|
||||
si_begin_gfx_cs_debug(ctx);
|
||||
if (ctx->is_debug)
|
||||
si_begin_gfx_cs_debug(ctx);
|
||||
|
||||
si_add_gds_to_buffer_list(ctx);
|
||||
si_add_gds_to_buffer_list(ctx);
|
||||
|
||||
/* Always invalidate caches at the beginning of IBs, because external
|
||||
* users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
|
||||
* buffers.
|
||||
*
|
||||
* Note that the cache flush done by the kernel at the end of GFX IBs
|
||||
* isn't useful here, because that flush can finish after the following
|
||||
* IB starts drawing.
|
||||
*
|
||||
* TODO: Do we also need to invalidate CB & DB caches?
|
||||
*/
|
||||
ctx->flags |= SI_CONTEXT_INV_ICACHE |
|
||||
SI_CONTEXT_INV_SCACHE |
|
||||
SI_CONTEXT_INV_VCACHE |
|
||||
SI_CONTEXT_INV_L2 |
|
||||
SI_CONTEXT_START_PIPELINE_STATS;
|
||||
/* Always invalidate caches at the beginning of IBs, because external
|
||||
* users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
|
||||
* buffers.
|
||||
*
|
||||
* Note that the cache flush done by the kernel at the end of GFX IBs
|
||||
* isn't useful here, because that flush can finish after the following
|
||||
* IB starts drawing.
|
||||
*
|
||||
* TODO: Do we also need to invalidate CB & DB caches?
|
||||
*/
|
||||
ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
|
||||
SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS;
|
||||
|
||||
ctx->cs_shader_state.initialized = false;
|
||||
si_all_descriptors_begin_new_cs(ctx);
|
||||
ctx->cs_shader_state.initialized = false;
|
||||
si_all_descriptors_begin_new_cs(ctx);
|
||||
|
||||
if (!ctx->has_graphics) {
|
||||
ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
|
||||
return;
|
||||
}
|
||||
if (!ctx->has_graphics) {
|
||||
ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
|
||||
return;
|
||||
}
|
||||
|
||||
/* set all valid group as dirty so they get reemited on
|
||||
* next draw command
|
||||
*/
|
||||
si_pm4_reset_emitted(ctx);
|
||||
/* set all valid group as dirty so they get reemited on
|
||||
* next draw command
|
||||
*/
|
||||
si_pm4_reset_emitted(ctx);
|
||||
|
||||
/* The CS initialization should be emitted before everything else. */
|
||||
si_pm4_emit(ctx, ctx->init_config);
|
||||
if (ctx->init_config_gs_rings)
|
||||
si_pm4_emit(ctx, ctx->init_config_gs_rings);
|
||||
/* The CS initialization should be emitted before everything else. */
|
||||
si_pm4_emit(ctx, ctx->init_config);
|
||||
if (ctx->init_config_gs_rings)
|
||||
si_pm4_emit(ctx, ctx->init_config_gs_rings);
|
||||
|
||||
if (ctx->queued.named.ls)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
|
||||
if (ctx->queued.named.hs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
|
||||
if (ctx->queued.named.es)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
|
||||
if (ctx->queued.named.gs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
|
||||
if (ctx->queued.named.vs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
|
||||
if (ctx->queued.named.ps)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
|
||||
if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
|
||||
if (ctx->queued.named.ls)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
|
||||
if (ctx->queued.named.hs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
|
||||
if (ctx->queued.named.es)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
|
||||
if (ctx->queued.named.gs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
|
||||
if (ctx->queued.named.vs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
|
||||
if (ctx->queued.named.ps)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
|
||||
if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
|
||||
|
||||
/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
|
||||
bool has_clear_state = ctx->screen->info.has_clear_state;
|
||||
if (has_clear_state) {
|
||||
ctx->framebuffer.dirty_cbufs =
|
||||
u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
|
||||
/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
|
||||
ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
|
||||
} else {
|
||||
ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
|
||||
ctx->framebuffer.dirty_zsbuf = true;
|
||||
}
|
||||
/* This should always be marked as dirty to set the framebuffer scissor
|
||||
* at least. */
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
|
||||
/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
|
||||
bool has_clear_state = ctx->screen->info.has_clear_state;
|
||||
if (has_clear_state) {
|
||||
ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
|
||||
/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
|
||||
ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
|
||||
} else {
|
||||
ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
|
||||
ctx->framebuffer.dirty_zsbuf = true;
|
||||
}
|
||||
/* This should always be marked as dirty to set the framebuffer scissor
|
||||
* at least. */
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
|
||||
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
|
||||
/* CLEAR_STATE sets zeros. */
|
||||
if (!has_clear_state || ctx->clip_state.any_nonzeros)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
|
||||
ctx->sample_locs_num_samples = 0;
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
|
||||
/* CLEAR_STATE sets 0xffff. */
|
||||
if (!has_clear_state || ctx->sample_mask != 0xffff)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
|
||||
/* CLEAR_STATE sets zeros. */
|
||||
if (!has_clear_state || ctx->blend_color.any_nonzeros)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
|
||||
if (ctx->chip_class >= GFX9)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
|
||||
if (!ctx->screen->use_ngg_streamout)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
|
||||
/* CLEAR_STATE disables all window rectangles. */
|
||||
if (!has_clear_state || ctx->num_window_rectangles > 0)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
|
||||
/* CLEAR_STATE sets zeros. */
|
||||
if (!has_clear_state || ctx->clip_state.any_nonzeros)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
|
||||
ctx->sample_locs_num_samples = 0;
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
|
||||
/* CLEAR_STATE sets 0xffff. */
|
||||
if (!has_clear_state || ctx->sample_mask != 0xffff)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
|
||||
/* CLEAR_STATE sets zeros. */
|
||||
if (!has_clear_state || ctx->blend_color.any_nonzeros)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
|
||||
if (ctx->chip_class >= GFX9)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
|
||||
if (!ctx->screen->use_ngg_streamout)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
|
||||
/* CLEAR_STATE disables all window rectangles. */
|
||||
if (!has_clear_state || ctx->num_window_rectangles > 0)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
|
||||
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
|
||||
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
|
||||
if (ctx->scratch_buffer) {
|
||||
si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
|
||||
}
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
|
||||
if (ctx->scratch_buffer) {
|
||||
si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
|
||||
}
|
||||
|
||||
if (ctx->streamout.suspended) {
|
||||
ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
|
||||
si_streamout_buffers_dirty(ctx);
|
||||
}
|
||||
if (ctx->streamout.suspended) {
|
||||
ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
|
||||
si_streamout_buffers_dirty(ctx);
|
||||
}
|
||||
|
||||
if (!list_is_empty(&ctx->active_queries))
|
||||
si_resume_queries(ctx);
|
||||
if (!list_is_empty(&ctx->active_queries))
|
||||
si_resume_queries(ctx);
|
||||
|
||||
assert(!ctx->gfx_cs->prev_dw);
|
||||
ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
|
||||
assert(!ctx->gfx_cs->prev_dw);
|
||||
ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
|
||||
|
||||
/* Invalidate various draw states so that they are emitted before
|
||||
* the first draw call. */
|
||||
si_invalidate_draw_sh_constants(ctx);
|
||||
ctx->last_index_size = -1;
|
||||
ctx->last_primitive_restart_en = -1;
|
||||
ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
|
||||
ctx->last_prim = -1;
|
||||
ctx->last_multi_vgt_param = -1;
|
||||
ctx->last_vs_state = ~0;
|
||||
ctx->last_ls = NULL;
|
||||
ctx->last_tcs = NULL;
|
||||
ctx->last_tes_sh_base = -1;
|
||||
ctx->last_num_tcs_input_cp = -1;
|
||||
ctx->last_ls_hs_config = -1; /* impossible value */
|
||||
ctx->last_binning_enabled = -1;
|
||||
ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
|
||||
/* Invalidate various draw states so that they are emitted before
|
||||
* the first draw call. */
|
||||
si_invalidate_draw_sh_constants(ctx);
|
||||
ctx->last_index_size = -1;
|
||||
ctx->last_primitive_restart_en = -1;
|
||||
ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
|
||||
ctx->last_prim = -1;
|
||||
ctx->last_multi_vgt_param = -1;
|
||||
ctx->last_vs_state = ~0;
|
||||
ctx->last_ls = NULL;
|
||||
ctx->last_tcs = NULL;
|
||||
ctx->last_tes_sh_base = -1;
|
||||
ctx->last_num_tcs_input_cp = -1;
|
||||
ctx->last_ls_hs_config = -1; /* impossible value */
|
||||
ctx->last_binning_enabled = -1;
|
||||
ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
|
||||
|
||||
ctx->prim_discard_compute_ib_initialized = false;
|
||||
ctx->prim_discard_compute_ib_initialized = false;
|
||||
|
||||
/* Compute-based primitive discard:
|
||||
* The index ring is divided into 2 halves. Switch between the halves
|
||||
* in the same fashion as doublebuffering.
|
||||
*/
|
||||
if (ctx->index_ring_base)
|
||||
ctx->index_ring_base = 0;
|
||||
else
|
||||
ctx->index_ring_base = ctx->index_ring_size_per_ib;
|
||||
/* Compute-based primitive discard:
|
||||
* The index ring is divided into 2 halves. Switch between the halves
|
||||
* in the same fashion as doublebuffering.
|
||||
*/
|
||||
if (ctx->index_ring_base)
|
||||
ctx->index_ring_base = 0;
|
||||
else
|
||||
ctx->index_ring_base = ctx->index_ring_size_per_ib;
|
||||
|
||||
ctx->index_ring_offset = 0;
|
||||
ctx->index_ring_offset = 0;
|
||||
|
||||
STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);
|
||||
STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);
|
||||
|
||||
if (has_clear_state) {
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */
|
||||
if (has_clear_state) {
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
|
||||
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
      ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] =
         0x0000001e; /* From GFX8 */

      /* Set all cleared context registers to saved. */
      ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */
      ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */
   } else {
      /* Set all register values to unknown. */
      ctx->tracked_regs.reg_saved = 0;
      ctx->last_gs_out_prim = -1; /* unknown */
   }

   /* 0xffffffff is an impossible value for register SPI_PS_INPUT_CNTL_n */
   memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
}
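The reg_value/reg_saved pair initialized above exists so that later state emission can skip register writes whose value is already known to be loaded. A minimal sketch of such a check, assuming a hypothetical si_opt_set_context_reg() helper on top of the si_pm4_set_reg() API from this series (the driver's real emit paths are more involved and operate on the command stream directly):

/* Hypothetical helper: only record the register write if the shadowed value
 * is unknown or differs from the new value. */
static void si_opt_set_context_reg(struct si_context *ctx, struct si_pm4_state *pm4,
                                   unsigned reg_offset, unsigned tracked_idx, uint32_t value)
{
   if ((ctx->tracked_regs.reg_saved & (1ull << tracked_idx)) &&
       ctx->tracked_regs.reg_value[tracked_idx] == value)
      return; /* the register already holds this value, skip the packet */

   si_pm4_set_reg(pm4, reg_offset, value);
   ctx->tracked_regs.reg_saved |= 1ull << tracked_idx;
   ctx->tracked_regs.reg_value[tracked_idx] = value;
}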
@@ -40,242 +40,234 @@
 * fps (there are too few samples per frame). */
#define SAMPLES_PER_SEC 10000

#define GRBM_STATUS   0x8010
#define TA_BUSY(x)    (((x) >> 14) & 0x1)
#define GDS_BUSY(x)   (((x) >> 15) & 0x1)
#define VGT_BUSY(x)   (((x) >> 17) & 0x1)
#define IA_BUSY(x)    (((x) >> 19) & 0x1)
#define SX_BUSY(x)    (((x) >> 20) & 0x1)
#define WD_BUSY(x)    (((x) >> 21) & 0x1)
#define SPI_BUSY(x)   (((x) >> 22) & 0x1)
#define BCI_BUSY(x)   (((x) >> 23) & 0x1)
#define SC_BUSY(x)    (((x) >> 24) & 0x1)
#define PA_BUSY(x)    (((x) >> 25) & 0x1)
#define DB_BUSY(x)    (((x) >> 26) & 0x1)
#define CP_BUSY(x)    (((x) >> 29) & 0x1)
#define CB_BUSY(x)    (((x) >> 30) & 0x1)
#define GUI_ACTIVE(x) (((x) >> 31) & 0x1)

#define SRBM_STATUS2 0x0e4c
#define SDMA_BUSY(x) (((x) >> 5) & 0x1)

#define CP_STAT              0x8680
#define PFP_BUSY(x)          (((x) >> 15) & 0x1)
#define MEQ_BUSY(x)          (((x) >> 16) & 0x1)
#define ME_BUSY(x)           (((x) >> 17) & 0x1)
#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1)
#define DMA_BUSY(x)          (((x) >> 22) & 0x1)
#define SCRATCH_RAM_BUSY(x)  (((x) >> 24) & 0x1)

#define IDENTITY(x) x

#define UPDATE_COUNTER(field, mask)                  \
   do {                                              \
      if (mask(value))                               \
         p_atomic_inc(&counters->named.field.busy);  \
      else                                           \
         p_atomic_inc(&counters->named.field.idle);  \
   } while (0)

static void si_update_mmio_counters(struct si_screen *sscreen, union si_mmio_counters *counters)
{
   uint32_t value = 0;
   bool gui_busy, sdma_busy = false;

   /* GRBM_STATUS */
   sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);

   UPDATE_COUNTER(ta, TA_BUSY);
   UPDATE_COUNTER(gds, GDS_BUSY);
   UPDATE_COUNTER(vgt, VGT_BUSY);
   UPDATE_COUNTER(ia, IA_BUSY);
   UPDATE_COUNTER(sx, SX_BUSY);
   UPDATE_COUNTER(wd, WD_BUSY);
   UPDATE_COUNTER(spi, SPI_BUSY);
   UPDATE_COUNTER(bci, BCI_BUSY);
   UPDATE_COUNTER(sc, SC_BUSY);
   UPDATE_COUNTER(pa, PA_BUSY);
   UPDATE_COUNTER(db, DB_BUSY);
   UPDATE_COUNTER(cp, CP_BUSY);
   UPDATE_COUNTER(cb, CB_BUSY);
   UPDATE_COUNTER(gui, GUI_ACTIVE);
   gui_busy = GUI_ACTIVE(value);

   if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) {
      /* SRBM_STATUS2 */
      sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value);

      UPDATE_COUNTER(sdma, SDMA_BUSY);
      sdma_busy = SDMA_BUSY(value);
   }

   if (sscreen->info.chip_class >= GFX8) {
      /* CP_STAT */
      sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value);

      UPDATE_COUNTER(pfp, PFP_BUSY);
      UPDATE_COUNTER(meq, MEQ_BUSY);
      UPDATE_COUNTER(me, ME_BUSY);
      UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
      UPDATE_COUNTER(cp_dma, DMA_BUSY);
      UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
   }

   value = gui_busy || sdma_busy;
   UPDATE_COUNTER(gpu, IDENTITY);
}

#undef UPDATE_COUNTER
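UPDATE_COUNTER above and BUSY_INDEX below both depend on union si_mmio_counters exposing the same storage twice: once as named busy/idle pairs and once as a flat array of unsigneds. That union is defined in si_pipe.h, which is not part of this diff, so the following is only a plausible shape written out for reference, not the real definition:

/* A plausible shape for the counters union assumed by UPDATE_COUNTER and
 * BUSY_INDEX (the real definition lives in si_pipe.h and may differ).
 * Each hardware unit gets a {busy, idle} pair, and the same memory is also
 * visible as a flat array so a counter can be addressed by index. */
struct si_mmio_counter {
   unsigned busy;
   unsigned idle;
};

union si_mmio_counters_example {
   struct {
      struct si_mmio_counter ta, gds, vgt, ia, sx, wd, spi, bci, sc, pa, db, cp, cb,
         sdma, pfp, meq, me, surf_sync, cp_dma, scratch_ram, gui, gpu;
   } named;
   unsigned array[44]; /* 22 units x {busy, idle} */
};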

static int si_gpu_load_thread(void *param)
{
   struct si_screen *sscreen = (struct si_screen *)param;
   const int period_us = 1000000 / SAMPLES_PER_SEC;
   int sleep_us = period_us;
   int64_t cur_time, last_time = os_time_get();

   while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) {
      if (sleep_us)
         os_time_sleep(sleep_us);

      /* Make sure we sleep the ideal amount of time to match
       * the expected frequency. */
      cur_time = os_time_get();

      if (os_time_timeout(last_time, last_time + period_us, cur_time))
         sleep_us = MAX2(sleep_us - 1, 1);
      else
         sleep_us += 1;

      /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/
      last_time = cur_time;

      /* Update the counters. */
      si_update_mmio_counters(sscreen, &sscreen->mmio_counters);
   }
   p_atomic_dec(&sscreen->gpu_load_stop_thread);
   return 0;
}

void si_gpu_load_kill_thread(struct si_screen *sscreen)
{
   if (!sscreen->gpu_load_thread)
      return;

   p_atomic_inc(&sscreen->gpu_load_stop_thread);
   thrd_join(sscreen->gpu_load_thread, NULL);
   sscreen->gpu_load_thread = 0;
}

static uint64_t si_read_mmio_counter(struct si_screen *sscreen, unsigned busy_index)
{
   /* Start the thread if needed. */
   if (!sscreen->gpu_load_thread) {
      simple_mtx_lock(&sscreen->gpu_load_mutex);
      /* Check again inside the mutex. */
      if (!sscreen->gpu_load_thread)
         sscreen->gpu_load_thread = u_thread_create(si_gpu_load_thread, sscreen);
      simple_mtx_unlock(&sscreen->gpu_load_mutex);
   }

   unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]);
   unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]);

   return busy | ((uint64_t)idle << 32);
}

static unsigned si_end_mmio_counter(struct si_screen *sscreen, uint64_t begin, unsigned busy_index)
{
   uint64_t end = si_read_mmio_counter(sscreen, busy_index);
   unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
   unsigned idle = (end >> 32) - (begin >> 32);

   /* Calculate the % of time the busy counter was being incremented.
    *
    * If no counters were incremented, return the current counter status.
    * It's for the case when the load is queried faster than
    * the counters are updated.
    */
   if (idle || busy) {
      return busy * 100 / (busy + idle);
   } else {
      union si_mmio_counters counters;

      memset(&counters, 0, sizeof(counters));
      si_update_mmio_counters(sscreen, &counters);
      return counters.array[busy_index] ? 100 : 0;
   }
}
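A quick worked example of the packing used above: each snapshot is busy | (idle << 32), and the percentage comes from the deltas between two snapshots. The sample counts below are made up purely for illustration.

/* Illustration only: made-up sample counts showing how two packed
 * busy|idle<<32 snapshots turn into a load percentage. */
static unsigned example_load_percentage(void)
{
   uint64_t begin = 100 | ((uint64_t)900 << 32);  /* 100 busy, 900 idle samples so far */
   uint64_t end = 400 | ((uint64_t)1600 << 32);   /* 400 busy, 1600 idle samples now */

   unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff); /* 300 busy samples in between */
   unsigned idle = (end >> 32) - (begin >> 32);               /* 700 idle samples in between */

   return busy * 100 / (busy + idle); /* 300 * 100 / 1000 = 30 */
}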

#define BUSY_INDEX(sscreen, field)  \
   (&sscreen->mmio_counters.named.field.busy - sscreen->mmio_counters.array)

static unsigned busy_index_from_type(struct si_screen *sscreen, unsigned type)
{
   switch (type) {
   case SI_QUERY_GPU_LOAD:
      return BUSY_INDEX(sscreen, gpu);
   case SI_QUERY_GPU_SHADERS_BUSY:
      return BUSY_INDEX(sscreen, spi);
   case SI_QUERY_GPU_TA_BUSY:
      return BUSY_INDEX(sscreen, ta);
   case SI_QUERY_GPU_GDS_BUSY:
      return BUSY_INDEX(sscreen, gds);
   case SI_QUERY_GPU_VGT_BUSY:
      return BUSY_INDEX(sscreen, vgt);
   case SI_QUERY_GPU_IA_BUSY:
      return BUSY_INDEX(sscreen, ia);
   case SI_QUERY_GPU_SX_BUSY:
      return BUSY_INDEX(sscreen, sx);
   case SI_QUERY_GPU_WD_BUSY:
      return BUSY_INDEX(sscreen, wd);
   case SI_QUERY_GPU_BCI_BUSY:
      return BUSY_INDEX(sscreen, bci);
   case SI_QUERY_GPU_SC_BUSY:
      return BUSY_INDEX(sscreen, sc);
   case SI_QUERY_GPU_PA_BUSY:
      return BUSY_INDEX(sscreen, pa);
   case SI_QUERY_GPU_DB_BUSY:
      return BUSY_INDEX(sscreen, db);
   case SI_QUERY_GPU_CP_BUSY:
      return BUSY_INDEX(sscreen, cp);
   case SI_QUERY_GPU_CB_BUSY:
      return BUSY_INDEX(sscreen, cb);
   case SI_QUERY_GPU_SDMA_BUSY:
      return BUSY_INDEX(sscreen, sdma);
   case SI_QUERY_GPU_PFP_BUSY:
      return BUSY_INDEX(sscreen, pfp);
   case SI_QUERY_GPU_MEQ_BUSY:
      return BUSY_INDEX(sscreen, meq);
   case SI_QUERY_GPU_ME_BUSY:
      return BUSY_INDEX(sscreen, me);
   case SI_QUERY_GPU_SURF_SYNC_BUSY:
      return BUSY_INDEX(sscreen, surf_sync);
   case SI_QUERY_GPU_CP_DMA_BUSY:
      return BUSY_INDEX(sscreen, cp_dma);
   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
      return BUSY_INDEX(sscreen, scratch_ram);
   default:
      unreachable("invalid query type");
   }
}

uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type)
{
   unsigned busy_index = busy_index_from_type(sscreen, type);
   return si_read_mmio_counter(sscreen, busy_index);
}

unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin)
{
   unsigned busy_index = busy_index_from_type(sscreen, type);
   return si_end_mmio_counter(sscreen, begin, busy_index);
}
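How a caller uses this pair: take a begin snapshot, let some work happen, then convert the end snapshot into a percentage. The sketch below is illustrative only; the surrounding code and the choice of query type are placeholders, not the driver's actual query path.

/* Illustrative caller: bracket a stretch of work with the begin/end pair.
 * The query type is one of the SI_QUERY_GPU_* values handled by
 * busy_index_from_type(); everything around the calls is a placeholder. */
static unsigned example_measure_gpu_load(struct si_screen *sscreen)
{
   uint64_t begin = si_begin_counter(sscreen, SI_QUERY_GPU_LOAD);

   /* ... submit and wait for some GPU work here ... */

   return si_end_counter(sscreen, SI_QUERY_GPU_LOAD, begin); /* 0..100 (% busy) */
}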
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -22,170 +22,159 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "sid.h"
#include "util/u_memory.h"

void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode)
{
   state->last_opcode = opcode;
   state->last_pm4 = state->ndw++;
}

void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw)
{
   state->pm4[state->ndw++] = dw;
}

void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
{
   unsigned count;
   count = state->ndw - state->last_pm4 - 2;
   state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate);

   assert(state->ndw <= SI_PM4_MAX_DW);
}

void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
   unsigned opcode;

   if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
      opcode = PKT3_SET_CONFIG_REG;
      reg -= SI_CONFIG_REG_OFFSET;

   } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
      opcode = PKT3_SET_SH_REG;
      reg -= SI_SH_REG_OFFSET;

   } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
      opcode = PKT3_SET_CONTEXT_REG;
      reg -= SI_CONTEXT_REG_OFFSET;

   } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
      opcode = PKT3_SET_UCONFIG_REG;
      reg -= CIK_UCONFIG_REG_OFFSET;

   } else {
      PRINT_ERR("Invalid register offset %08x!\n", reg);
      return;
   }

   reg >>= 2;

   if (opcode != state->last_opcode || reg != (state->last_reg + 1)) {
      si_pm4_cmd_begin(state, opcode);
      si_pm4_cmd_add(state, reg);
   }

   state->last_reg = reg;
   si_pm4_cmd_add(state, val);
   si_pm4_cmd_end(state, false);
}
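The last_opcode/last_reg check in si_pm4_set_reg merges writes to consecutive registers into a single SET_*_REG packet. A small sketch of the resulting dword stream, assuming a freshly zeroed si_pm4_state; the register offsets are placeholders chosen only to be adjacent:

/* Sketch: two writes to consecutive context registers are merged into one
 * SET_CONTEXT_REG packet by the last_opcode/last_reg check above.
 * The offsets are illustrative, not real register definitions. */
static void example_merged_reg_writes(struct si_pm4_state *pm4)
{
   si_pm4_set_reg(pm4, SI_CONTEXT_REG_OFFSET + 0x100, 0x1111);
   si_pm4_set_reg(pm4, SI_CONTEXT_REG_OFFSET + 0x104, 0x2222);
   /* pm4->pm4[] now holds 4 dwords:
    *   [0] PKT3(PKT3_SET_CONTEXT_REG, 2, 0)  -- count rewritten by si_pm4_cmd_end
    *   [1] 0x100 >> 2                        -- starting register index
    *   [2] 0x1111                            -- value for the first register
    *   [3] 0x2222                            -- value appended to the same packet
    */
}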

void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage,
                   enum radeon_bo_priority priority)
{
   unsigned idx = state->nbo++;
   assert(idx < SI_PM4_MAX_BO);

   si_resource_reference(&state->bo[idx], bo);
   state->bo_usage[idx] = usage;
   state->bo_priority[idx] = priority;
}

void si_pm4_clear_state(struct si_pm4_state *state)
{
   for (int i = 0; i < state->nbo; ++i)
      si_resource_reference(&state->bo[i], NULL);
   si_resource_reference(&state->indirect_buffer, NULL);
   state->nbo = 0;
   state->ndw = 0;
}

void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx)
{
   if (!state)
      return;

   if (idx != ~0 && sctx->emitted.array[idx] == state) {
      sctx->emitted.array[idx] = NULL;
   }

   si_pm4_clear_state(state);
   FREE(state);
}

void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   for (int i = 0; i < state->nbo; ++i) {
      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i], state->bo_usage[i],
                                state->bo_priority[i]);
   }

   if (!state->indirect_buffer) {
      radeon_emit_array(cs, state->pm4, state->ndw);
   } else {
      struct si_resource *ib = state->indirect_buffer;

      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2);

      radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
      radeon_emit(cs, ib->gpu_address);
      radeon_emit(cs, ib->gpu_address >> 32);
      radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff);
   }

   if (state->atom.emit)
      state->atom.emit(sctx);
}

void si_pm4_reset_emitted(struct si_context *sctx)
{
   memset(&sctx->emitted, 0, sizeof(sctx->emitted));
   sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES);
}

void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state)
{
   struct pipe_screen *screen = sctx->b.screen;
   unsigned aligned_ndw = align(state->ndw, 8);

   /* only supported on GFX7 and later */
   if (sctx->chip_class < GFX7)
      return;

   assert(state->ndw);
   assert(aligned_ndw <= SI_PM4_MAX_DW);

   si_resource_reference(&state->indirect_buffer, NULL);
   /* TODO: this hangs with 1024 or higher alignment on GFX9. */
   state->indirect_buffer =
      si_aligned_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, aligned_ndw * 4, 256);
   if (!state->indirect_buffer)
      return;

   /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */
   if (sctx->screen->info.gfx_ib_pad_with_type2) {
      for (int i = state->ndw; i < aligned_ndw; i++)
         state->pm4[i] = 0x80000000; /* type2 nop packet */
   } else {
      for (int i = state->ndw; i < aligned_ndw; i++)
         state->pm4[i] = 0xffff1000; /* type3 nop packet */
   }

   pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b, 0, aligned_ndw * 4, state->pm4);
}
@@ -27,8 +27,8 @@

#include "radeon/radeon_winsys.h"

#define SI_PM4_MAX_DW 176
#define SI_PM4_MAX_BO 3

// forward defines
struct si_context;

@@ -37,32 +37,31 @@ struct si_context;
 * command buffer (AKA indirect buffer, AKA IB, AKA command stream, AKA CS).
 */
struct si_atom {
   void (*emit)(struct si_context *ctx);
};

struct si_pm4_state {
   /* optional indirect buffer */
   struct si_resource *indirect_buffer;

   /* PKT3_SET_*_REG handling */
   unsigned last_opcode;
   unsigned last_reg;
   unsigned last_pm4;

   /* commands for the DE */
   unsigned ndw;
   uint32_t pm4[SI_PM4_MAX_DW];

   /* BO's referenced by this state */
   unsigned nbo;
   struct si_resource *bo[SI_PM4_MAX_BO];
   enum radeon_bo_usage bo_usage[SI_PM4_MAX_BO];
   enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO];

   /* For shader states only */
   struct si_shader *shader;
   struct si_atom atom;
};

void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode);

@@ -70,17 +69,12 @@ void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate);

void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage,
                   enum radeon_bo_priority priority);
void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state);

void si_pm4_clear_state(struct si_pm4_state *state);
void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx);

void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state);
void si_pm4_reset_emitted(struct si_context *sctx);
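A short end-to-end usage sketch of the si_pm4_state API declared above. The allocation via CALLOC_STRUCT, the register offsets, and the priority value are illustrative assumptions, not taken from a real call site:

/* Sketch of typical si_pm4_state usage with the API above. The specific
 * offsets, priority, and CALLOC_STRUCT allocation are placeholders. */
static struct si_pm4_state *example_build_state(struct si_resource *buf)
{
   struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
   if (!pm4)
      return NULL;

   /* Record a couple of register writes into the CPU-side pm4[] array. */
   si_pm4_set_reg(pm4, SI_SH_REG_OFFSET + 0x20, 0xdeadbeef);
   si_pm4_set_reg(pm4, SI_SH_REG_OFFSET + 0x24, 0xcafecafe);

   /* Keep the buffer these packets reference resident when the state is emitted. */
   si_pm4_add_bo(pm4, buf, RADEON_USAGE_READ, RADEON_PRIO_IB2);

   /* Later, si_pm4_emit(sctx, pm4) copies the dwords (or an indirect buffer
    * reference) into the current gfx command stream. */
   return pm4;
}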
File diff suppressed because it is too large
@@ -40,236 +40,220 @@ struct si_resource;

#define SI_MAX_STREAMS 4

enum
{
   SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
   SI_QUERY_DECOMPRESS_CALLS,
   SI_QUERY_MRT_DRAW_CALLS,
   SI_QUERY_PRIM_RESTART_CALLS,
   SI_QUERY_SPILL_DRAW_CALLS,
   SI_QUERY_COMPUTE_CALLS,
   SI_QUERY_SPILL_COMPUTE_CALLS,
   SI_QUERY_DMA_CALLS,
   SI_QUERY_CP_DMA_CALLS,
   SI_QUERY_NUM_VS_FLUSHES,
   SI_QUERY_NUM_PS_FLUSHES,
   SI_QUERY_NUM_CS_FLUSHES,
   SI_QUERY_NUM_CB_CACHE_FLUSHES,
   SI_QUERY_NUM_DB_CACHE_FLUSHES,
   SI_QUERY_NUM_L2_INVALIDATES,
   SI_QUERY_NUM_L2_WRITEBACKS,
   SI_QUERY_NUM_RESIDENT_HANDLES,
   SI_QUERY_TC_OFFLOADED_SLOTS,
   SI_QUERY_TC_DIRECT_SLOTS,
   SI_QUERY_TC_NUM_SYNCS,
   SI_QUERY_CS_THREAD_BUSY,
   SI_QUERY_GALLIUM_THREAD_BUSY,
   SI_QUERY_REQUESTED_VRAM,
   SI_QUERY_REQUESTED_GTT,
   SI_QUERY_MAPPED_VRAM,
   SI_QUERY_MAPPED_GTT,
   SI_QUERY_BUFFER_WAIT_TIME,
   SI_QUERY_NUM_MAPPED_BUFFERS,
   SI_QUERY_NUM_GFX_IBS,
   SI_QUERY_NUM_SDMA_IBS,
   SI_QUERY_GFX_BO_LIST_SIZE,
   SI_QUERY_GFX_IB_SIZE,
   SI_QUERY_NUM_BYTES_MOVED,
   SI_QUERY_NUM_EVICTIONS,
   SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS,
   SI_QUERY_VRAM_USAGE,
   SI_QUERY_VRAM_VIS_USAGE,
   SI_QUERY_GTT_USAGE,
   SI_QUERY_GPU_TEMPERATURE,
   SI_QUERY_CURRENT_GPU_SCLK,
   SI_QUERY_CURRENT_GPU_MCLK,
   SI_QUERY_GPU_LOAD,
   SI_QUERY_GPU_SHADERS_BUSY,
   SI_QUERY_GPU_TA_BUSY,
   SI_QUERY_GPU_GDS_BUSY,
   SI_QUERY_GPU_VGT_BUSY,
   SI_QUERY_GPU_IA_BUSY,
   SI_QUERY_GPU_SX_BUSY,
   SI_QUERY_GPU_WD_BUSY,
   SI_QUERY_GPU_BCI_BUSY,
   SI_QUERY_GPU_SC_BUSY,
   SI_QUERY_GPU_PA_BUSY,
   SI_QUERY_GPU_DB_BUSY,
   SI_QUERY_GPU_CP_BUSY,
   SI_QUERY_GPU_CB_BUSY,
   SI_QUERY_GPU_SDMA_BUSY,
   SI_QUERY_GPU_PFP_BUSY,
   SI_QUERY_GPU_MEQ_BUSY,
   SI_QUERY_GPU_ME_BUSY,
   SI_QUERY_GPU_SURF_SYNC_BUSY,
   SI_QUERY_GPU_CP_DMA_BUSY,
   SI_QUERY_GPU_SCRATCH_RAM_BUSY,
   SI_QUERY_NUM_COMPILATIONS,
   SI_QUERY_NUM_SHADERS_CREATED,
   SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
   SI_QUERY_GPIN_ASIC_ID,
   SI_QUERY_GPIN_NUM_SIMD,
   SI_QUERY_GPIN_NUM_RB,
   SI_QUERY_GPIN_NUM_SPI,
   SI_QUERY_GPIN_NUM_SE,
   SI_QUERY_TIME_ELAPSED_SDMA,
   SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */
   SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
   SI_QUERY_PD_NUM_PRIMS_REJECTED,
   SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
   SI_QUERY_LIVE_SHADER_CACHE_HITS,
   SI_QUERY_LIVE_SHADER_CACHE_MISSES,
   SI_QUERY_MEMORY_SHADER_CACHE_HITS,
   SI_QUERY_MEMORY_SHADER_CACHE_MISSES,
   SI_QUERY_DISK_SHADER_CACHE_HITS,
   SI_QUERY_DISK_SHADER_CACHE_MISSES,

   SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
};

enum
{
   SI_QUERY_GROUP_GPIN = 0,
   SI_NUM_SW_QUERY_GROUPS
};

struct si_query_ops {
   void (*destroy)(struct si_context *, struct si_query *);
   bool (*begin)(struct si_context *, struct si_query *);
   bool (*end)(struct si_context *, struct si_query *);
   bool (*get_result)(struct si_context *, struct si_query *, bool wait,
                      union pipe_query_result *result);
   void (*get_result_resource)(struct si_context *, struct si_query *, bool wait,
                               enum pipe_query_value_type result_type, int index,
                               struct pipe_resource *resource, unsigned offset);

   void (*suspend)(struct si_context *, struct si_query *);
   void (*resume)(struct si_context *, struct si_query *);
};

struct si_query {
   struct threaded_query b;
   const struct si_query_ops *ops;

   /* The PIPE_QUERY_xxx type of query */
   unsigned type;

   /* The number of dwords for suspend. */
   unsigned num_cs_dw_suspend;

   /* Linked list of queries that must be suspended at end of CS. */
   struct list_head active_list;
};

enum
{
   SI_QUERY_HW_FLAG_NO_START = (1 << 0),
   /* gap */
   /* whether begin_query doesn't clear the result */
   SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
};

struct si_query_hw_ops {
   bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *);
   void (*emit_start)(struct si_context *, struct si_query_hw *, struct si_resource *buffer,
                      uint64_t va);
   void (*emit_stop)(struct si_context *, struct si_query_hw *, struct si_resource *buffer,
                     uint64_t va);
   void (*clear_result)(struct si_query_hw *, union pipe_query_result *);
   void (*add_result)(struct si_screen *screen, struct si_query_hw *, void *buffer,
                      union pipe_query_result *result);
};

struct si_query_buffer {
   /* The buffer where query results are stored. */
   struct si_resource *buf;
   /* If a query buffer is full, a new buffer is created and the old one
    * is put in here. When we calculate the result, we sum up the samples
    * from all buffers. */
   struct si_query_buffer *previous;
   /* Offset of the next free result after current query data */
   unsigned results_end;
   bool unprepared;
};

void si_query_buffer_destroy(struct si_screen *sctx, struct si_query_buffer *buffer);
void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer);
bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
                           bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
                           unsigned size);
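The previous pointer in struct si_query_buffer chains full buffers together, and result readback sums the samples from every buffer in the chain. A minimal sketch of that walk; the accumulate() callback is a stand-in for whatever consumes the mapped results, not the driver's add_result machinery:

/* Minimal sketch of walking a si_query_buffer chain, newest to oldest.
 * accumulate() is a placeholder; the real driver funnels each buffer's
 * contents through si_query_hw_ops::add_result. */
static void example_sum_query_buffers(struct si_query_buffer *qbuf,
                                      void (*accumulate)(struct si_resource *buf, unsigned size))
{
   for (; qbuf; qbuf = qbuf->previous) {
      /* results_end is how many bytes of results this buffer holds. */
      accumulate(qbuf->buf, qbuf->results_end);
   }
}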

struct si_query_hw {
   struct si_query b;
   struct si_query_hw_ops *ops;
   unsigned flags;

   /* The query buffer and how many results are in it. */
   struct si_query_buffer buffer;
   /* Size of the result in memory for both begin_query and end_query,
    * this can be one or two numbers, or it could even be a size of a structure. */
   unsigned result_size;
   /* For transform feedback: which stream the query is for */
   unsigned stream;

   /* Workaround via compute shader */
   struct si_resource *workaround_buf;
   unsigned workaround_offset;
};

void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery);
bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery);
bool si_query_hw_end(struct si_context *sctx, struct si_query *squery);
bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                            union pipe_query_result *result);
void si_query_hw_suspend(struct si_context *sctx, struct si_query *query);
void si_query_hw_resume(struct si_context *sctx, struct si_query *query);

/* Shader-based queries */
struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index);

/* Performance counters */
struct si_perfcounters {
   unsigned num_groups;
   unsigned num_blocks;
   struct si_pc_block *blocks;

   unsigned num_stop_cs_dwords;
   unsigned num_instance_cs_dwords;

   bool separate_se;
   bool separate_instance;
};

struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
                                         unsigned *query_types);

int si_get_perfcounter_info(struct si_screen *, unsigned index,
                            struct pipe_driver_query_info *info);
int si_get_perfcounter_group_info(struct si_screen *, unsigned index,
                                  struct pipe_driver_query_group_info *info);

struct si_qbo_state {
   void *saved_compute;
   struct pipe_constant_buffer saved_const0;
   struct pipe_shader_buffer saved_ssbo[3];
   unsigned saved_ssbo_writable_mask;
};

#endif /* SI_QUERY_H */
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -25,8 +25,8 @@
#ifndef SI_SHADER_PRIVATE_H
#define SI_SHADER_PRIVATE_H

#include "ac_shader_abi.h"
#include "si_shader.h"

struct pipe_debug_callback;
@@ -38,275 +38,245 @@ struct pipe_debug_callback;
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
|
||||
|
||||
struct si_shader_output_values {
|
||||
LLVMValueRef values[4];
|
||||
unsigned semantic_name;
|
||||
unsigned semantic_index;
|
||||
ubyte vertex_stream[4];
|
||||
LLVMValueRef values[4];
|
||||
unsigned semantic_name;
|
||||
unsigned semantic_index;
|
||||
ubyte vertex_stream[4];
|
||||
};
|
||||
|
||||
struct si_shader_context {
|
||||
struct ac_llvm_context ac;
|
||||
struct si_shader *shader;
|
||||
struct si_screen *screen;
|
||||
struct ac_llvm_context ac;
|
||||
struct si_shader *shader;
|
||||
struct si_screen *screen;
|
||||
|
||||
unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
|
||||
unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
|
||||
|
||||
/* For clamping the non-constant index in resource indexing: */
|
||||
unsigned num_const_buffers;
|
||||
unsigned num_shader_buffers;
|
||||
unsigned num_images;
|
||||
unsigned num_samplers;
|
||||
/* For clamping the non-constant index in resource indexing: */
|
||||
unsigned num_const_buffers;
|
||||
unsigned num_shader_buffers;
|
||||
unsigned num_images;
|
||||
unsigned num_samplers;
|
||||
|
||||
struct ac_shader_args args;
|
||||
struct ac_shader_abi abi;
|
||||
struct ac_shader_args args;
|
||||
struct ac_shader_abi abi;
|
||||
|
||||
LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
|
||||
LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
|
||||
|
||||
LLVMBasicBlockRef merged_wrap_if_entry_block;
|
||||
int merged_wrap_if_label;
|
||||
LLVMBasicBlockRef merged_wrap_if_entry_block;
|
||||
int merged_wrap_if_label;
|
||||
|
||||
LLVMValueRef main_fn;
|
||||
LLVMTypeRef return_type;
|
||||
LLVMValueRef main_fn;
|
||||
LLVMTypeRef return_type;
|
||||
|
||||
struct ac_arg const_and_shader_buffers;
|
||||
struct ac_arg samplers_and_images;
|
||||
struct ac_arg const_and_shader_buffers;
|
||||
struct ac_arg samplers_and_images;
|
||||
|
||||
/* For merged shaders, the per-stage descriptors for the stage other
|
||||
* than the one we're processing, used to pass them through from the
|
||||
* first stage to the second.
|
||||
*/
|
||||
struct ac_arg other_const_and_shader_buffers;
|
||||
struct ac_arg other_samplers_and_images;
|
||||
/* For merged shaders, the per-stage descriptors for the stage other
|
||||
* than the one we're processing, used to pass them through from the
|
||||
* first stage to the second.
|
||||
*/
|
||||
struct ac_arg other_const_and_shader_buffers;
|
||||
struct ac_arg other_samplers_and_images;
|
||||
|
||||
struct ac_arg rw_buffers;
|
||||
struct ac_arg bindless_samplers_and_images;
|
||||
/* Common inputs for merged shaders. */
|
||||
struct ac_arg merged_wave_info;
|
||||
struct ac_arg merged_scratch_offset;
|
||||
struct ac_arg small_prim_cull_info;
|
||||
/* API VS */
|
||||
struct ac_arg vertex_buffers;
|
||||
struct ac_arg vb_descriptors[5];
|
||||
struct ac_arg rel_auto_id;
|
||||
struct ac_arg vs_prim_id;
|
||||
struct ac_arg vertex_index0;
|
||||
/* VS states and layout of LS outputs / TCS inputs at the end
|
||||
* [0] = clamp vertex color
|
||||
* [1] = indexed
|
||||
* [2:3] = NGG: output primitive type
|
||||
* [4:5] = NGG: provoking vertex index
|
||||
* [6] = NGG: streamout queries enabled
|
||||
* [7:10] = NGG: small prim filter precision = num_samples / quant_mode,
|
||||
* but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12
|
||||
* Only the first 4 bits of the exponent are stored.
|
||||
* Set it like this: (fui(num_samples / quant_mode) >> 23)
|
||||
* Expand to FP32 like this: ((0x70 | value) << 23);
|
||||
* With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15)
|
||||
* = 1/2^(15 - value) in FP32
|
||||
* [11:23] = stride between patches in DW = num_inputs * num_vertices * 4
|
||||
* max = 32*32*4 + 32*4
|
||||
* [24:31] = stride between vertices in DW = num_inputs * 4
|
||||
* max = 32*4
|
||||
*/
|
||||
struct ac_arg vs_state_bits;
|
||||
struct ac_arg vs_blit_inputs;
|
||||
struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
|
||||
/* HW VS */
|
||||
struct ac_arg streamout_config;
|
||||
struct ac_arg streamout_write_index;
|
||||
struct ac_arg streamout_offset[4];
|
||||
struct ac_arg rw_buffers;
|
||||
struct ac_arg bindless_samplers_and_images;
|
||||
/* Common inputs for merged shaders. */
|
||||
struct ac_arg merged_wave_info;
|
||||
struct ac_arg merged_scratch_offset;
|
||||
struct ac_arg small_prim_cull_info;
|
||||
/* API VS */
|
||||
struct ac_arg vertex_buffers;
|
||||
struct ac_arg vb_descriptors[5];
|
||||
struct ac_arg rel_auto_id;
|
||||
struct ac_arg vs_prim_id;
|
||||
struct ac_arg vertex_index0;
|
||||
/* VS states and layout of LS outputs / TCS inputs at the end
|
||||
* [0] = clamp vertex color
|
||||
* [1] = indexed
|
||||
* [2:3] = NGG: output primitive type
|
||||
* [4:5] = NGG: provoking vertex index
|
||||
* [6] = NGG: streamout queries enabled
|
||||
* [7:10] = NGG: small prim filter precision = num_samples / quant_mode,
|
||||
* but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12
|
||||
* Only the first 4 bits of the exponent are stored.
|
||||
* Set it like this: (fui(num_samples / quant_mode) >> 23)
|
||||
* Expand to FP32 like this: ((0x70 | value) << 23);
|
||||
* With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15)
|
||||
* = 1/2^(15 - value) in FP32
|
||||
* [11:23] = stride between patches in DW = num_inputs * num_vertices * 4
|
||||
* max = 32*32*4 + 32*4
|
||||
* [24:31] = stride between vertices in DW = num_inputs * 4
|
||||
* max = 32*4
|
||||
*/
|
||||
struct ac_arg vs_state_bits;
|
||||
struct ac_arg vs_blit_inputs;
|
||||
struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
|
||||
/* HW VS */
|
||||
struct ac_arg streamout_config;
|
||||
struct ac_arg streamout_write_index;
|
||||
struct ac_arg streamout_offset[4];
|
||||
|
||||
/* API TCS & TES */
|
||||
/* Layout of TCS outputs in the offchip buffer
|
||||
* # 6 bits
|
||||
* [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
|
||||
* # 6 bits
|
||||
* [6:11] = the number of output vertices per patch, max = 32
|
||||
* # 20 bits
|
||||
* [12:31] = the offset of per patch attributes in the buffer in bytes.
|
||||
* max = NUM_PATCHES*32*32*16
|
||||
*/
|
||||
struct ac_arg tcs_offchip_layout;
|
||||
/* API TCS & TES */
|
||||
/* Layout of TCS outputs in the offchip buffer
|
||||
* # 6 bits
|
    * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
    * # 6 bits
    * [6:11] = the number of output vertices per patch, max = 32
    * # 20 bits
    * [12:31] = the offset of per patch attributes in the buffer in bytes.
    *           max = NUM_PATCHES*32*32*16
    */
   struct ac_arg tcs_offchip_layout;

   /* API TCS */
   /* Offsets where TCS outputs and TCS patch outputs live in LDS:
    *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
    *   [16:31] = TCS output patch0 offset for per-patch / 16
    *             max = (NUM_PATCHES + 1) * 32*32
    */
   struct ac_arg tcs_out_lds_offsets;
   /* Layout of TCS outputs / TES inputs:
    *   [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
    *            max = 32*32*4 + 32*4
    *   [13:18] = gl_PatchVerticesIn, max = 32
    *   [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers
    */
   struct ac_arg tcs_out_lds_layout;
   struct ac_arg tcs_offchip_offset;
   struct ac_arg tcs_factor_offset;

   /* API TES */
   struct ac_arg tes_offchip_addr;
   struct ac_arg tes_u;
   struct ac_arg tes_v;
   struct ac_arg tes_rel_patch_id;
   /* HW ES */
   struct ac_arg es2gs_offset;
   /* HW GS */
   /* On gfx10:
    *  - bits 0..11: ordered_wave_id
    *  - bits 12..20: number of vertices in group
    *  - bits 22..30: number of primitives in group
    */
   struct ac_arg gs_tg_info;
   /* API GS */
   struct ac_arg gs2vs_offset;
   struct ac_arg gs_wave_id;       /* GFX6 */
   struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */
   struct ac_arg gs_vtx01_offset;  /* in dwords (GFX9) */
   struct ac_arg gs_vtx23_offset;  /* in dwords (GFX9) */
   struct ac_arg gs_vtx45_offset;  /* in dwords (GFX9) */
   /* PS */
   struct ac_arg pos_fixed_pt;
   /* CS */
   struct ac_arg block_size;
   struct ac_arg cs_user_data;

   struct ac_llvm_compiler *compiler;

   /* Preloaded descriptors. */
   LLVMValueRef esgs_ring;
   LLVMValueRef gsvs_ring[4];
   LLVMValueRef tess_offchip_ring;

   LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
   LLVMValueRef gs_next_vertex[4];
   LLVMValueRef gs_curprim_verts[4];
   LLVMValueRef gs_generated_prims[4];
   LLVMValueRef gs_ngg_emit;
   LLVMValueRef gs_ngg_scratch;
   LLVMValueRef postponed_kill;
   LLVMValueRef return_value;
};
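The bit-layout comments above pack three tessellation parameters into a single 32-bit SGPR. As a rough illustration only (the struct and helper below are made up for this note, not part of the diff; in the shader the same extraction is what si_unpack_param performs with a shift and a bit width), the fields would be unpacked like this:

#include <stdint.h>

struct tcs_offchip_layout_example {
   unsigned num_patches;      /* [0:5]                 */
   unsigned num_out_vertices; /* [6:11]                */
   unsigned per_patch_offset; /* [12:31], in bytes     */
};

struct tcs_offchip_layout_example unpack_tcs_offchip_layout(uint32_t v)
{
   struct tcs_offchip_layout_example l;
   l.num_patches = v & 0x3f;             /* 6 bits  */
   l.num_out_vertices = (v >> 6) & 0x3f; /* 6 bits  */
   l.per_patch_offset = v >> 12;         /* 20 bits */
   return l;
}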

static inline struct si_shader_context *si_shader_context_from_abi(struct ac_shader_abi *abi)
{
   struct si_shader_context *ctx = NULL;
   return container_of(abi, ctx, abi);
}
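si_shader_context_from_abi recovers the enclosing context from the embedded abi member via container_of. A minimal stand-alone sketch of the same idiom, with hypothetical struct names that are not part of this diff:

#include <stddef.h>

struct example_abi { int dummy; };
struct example_ctx {
   int other_state;
   struct example_abi abi; /* embedded member, like ac_shader_abi inside si_shader_context */
};

static inline struct example_ctx *example_ctx_from_abi(struct example_abi *abi)
{
   /* Subtract the member offset from the member pointer to get the parent object. */
   return (struct example_ctx *)((char *)abi - offsetof(struct example_ctx, abi));
}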

bool si_is_multi_part_shader(struct si_shader *shader);
bool si_is_merged_shader(struct si_shader *shader);
void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx);
unsigned si_get_max_workgroup_size(const struct si_shader *shader);
bool si_need_ps_prolog(const union si_shader_part_key *key);
void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
                          bool separate_prolog);
void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key);
void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader);

bool gfx10_ngg_export_prim_early(struct si_shader *shader);
void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
                                 LLVMValueRef prim_passthrough);
void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs,
                                               LLVMValueRef *addrs);
void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs);
void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader);

/* si_shader_llvm.c */
bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
                     struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
                     struct ac_llvm_context *ac, struct pipe_debug_callback *debug,
                     enum pipe_shader_type shader_type, const char *name, bool less_optimized);
void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
                          struct ac_llvm_compiler *compiler, unsigned wave_size);
void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
                         unsigned num_return_elems, unsigned max_workgroup_size);
void si_llvm_optimize_module(struct si_shader_context *ctx);
void si_llvm_dispose(struct si_shader_context *ctx);
LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
                                  LLVMValueRef offset);
void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret);
LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index);
LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
                                       struct ac_arg param, unsigned return_index);
LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index);
LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx);
LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type,
                                   LLVMValueRef val1, LLVMValueRef val2);
void si_llvm_emit_barrier(struct si_shader_context *ctx);
void si_llvm_declare_esgs_ring(struct si_shader_context *ctx);
void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
                             unsigned bitoffset);
LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
                             unsigned bitwidth);
LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle);
LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi);
void si_llvm_declare_compute_memory(struct si_shader_context *ctx);
bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir);
void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
                               unsigned num_parts, unsigned main_part,
                               unsigned next_shader_first_part);

/* si_shader_llvm_gs.c */
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);
LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx);
void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
void si_preload_esgs_ring(struct si_shader_context *ctx);
void si_preload_gs_rings(struct si_shader_context *ctx);
void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_gs_callbacks(struct si_shader_context *ctx);

/* si_shader_llvm_tess.c */
void si_llvm_preload_tes_rings(struct si_shader_context *ctx);
void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx);
void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);

/* si_shader_llvm_ps.c */
LLVMValueRef si_get_sample_id(struct si_shader_context *ctx);
void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader);
void si_llvm_init_ps_callbacks(struct si_shader_context *ctx);

/* si_shader_llvm_resources.c */

@@ -314,21 +284,16 @@ void si_llvm_init_resource_callbacks(struct si_shader_context *ctx);

/* si_shader_llvm_vs.c */
void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir);
void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
                                    LLVMValueRef const *so_write_offsets,
                                    struct pipe_stream_output *stream_out,
                                    struct si_shader_output_values *shader_out);
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
                            unsigned noutput, unsigned stream);
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
                              struct si_shader_output_values *outputs, unsigned noutput);
void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);

#endif
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -22,111 +22,98 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"

/**
 * Return a value that is equal to the given i32 \p index if it lies in [0,num)
 * or an undefined value in the same interval otherwise.
 */
static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index,
                                        unsigned num)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0);
   LLVMValueRef cc;

   if (util_is_power_of_two_or_zero(num)) {
      index = LLVMBuildAnd(builder, index, c_max, "");
   } else {
      /* In theory, this MAX pattern should result in code that is
       * as good as the bit-wise AND above.
       *
       * In practice, LLVM generates worse code (at the time of
       * writing), because its value tracking is not strong enough.
       */
      cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, "");
      index = LLVMBuildSelect(builder, cc, index, c_max, "");
   }

   return index;
}
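A plain-C analogue of the bounding trick in si_llvm_bound_index may help: a power-of-two size allows a cheap bit mask, anything else falls back to a compare-and-select clamp. Illustrative only; the function below is not part of radeonsi:

#include <stdbool.h>
#include <stdint.h>

uint32_t bound_index_example(uint32_t index, uint32_t num)
{
   uint32_t max = num - 1;
   bool pow2 = (num & (num - 1)) == 0; /* C-level analogue of util_is_power_of_two_or_zero() */

   if (pow2)
      return index & max;          /* cheap bit-wise AND */
   /* compare-and-select, like the LLVMBuildICmp + LLVMBuildSelect pair above */
   return index <= max ? index : max;
}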

static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
{
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
   struct si_shader_selector *sel = ctx->shader->selector;

   /* Do the bounds checking with a descriptor, because
    * doing computation and manual bounds checking of 64-bit
    * addresses generates horrible VALU code with very high
    * VGPR usage and very low SIMD occupancy.
    */
   ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");

   LLVMValueRef desc0, desc1;
   desc0 = ptr;
   desc1 = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);

   uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                    S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

   if (ctx->screen->info.chip_class >= GFX10)
      rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
   else
      rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

   LLVMValueRef desc_elems[] = {desc0, desc1,
                                LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0),
                                LLVMConstInt(ctx->ac.i32, rsrc3, false)};

   return ac_build_gather_values(&ctx->ac, desc_elems, 4);
}

static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_selector *sel = ctx->shader->selector;

   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);

   if (sel->info.const_buffers_declared == 1 && sel->info.shader_buffers_declared == 0) {
      return load_const_buffer_desc_fast_path(ctx);
   }

   index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
   index =
      LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), "");

   return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
}

static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);

   index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
   index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0),
                        index, "");

   return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
}
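The index arithmetic in load_ubo and load_ssbo maps API slots into one combined descriptor list: constant buffers live after all shader buffers, and shader buffers are stored in reverse order. A sketch with a made-up slot count (the real SI_NUM_SHADER_BUFFERS is defined elsewhere in the driver and is not stated by this diff):

enum { EXAMPLE_NUM_SHADER_BUFFERS = 32 }; /* hypothetical value for illustration */

/* Constant buffers live after all shader buffers in the combined list. */
unsigned ubo_slot(unsigned index)
{
   return EXAMPLE_NUM_SHADER_BUFFERS + index;
}

/* Shader buffers are stored in reverse order at the start of the list. */
unsigned ssbo_slot(unsigned index)
{
   return EXAMPLE_NUM_SHADER_BUFFERS - 1 - index;
}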

/**
@@ -140,181 +127,167 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
 * nicer: disabling DCC in the shader still leads to undefined results but
 * avoids the lockup.
 */
static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc)
{
   if (ctx->screen->info.chip_class <= GFX7) {
      return rsrc;
   } else {
      LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
      LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0);
      LLVMValueRef tmp;

      tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
      tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
      return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
   }
}

/* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should
 * adjust "index" to point to FMASK. */
static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list,
                                       LLVMValueRef index, enum ac_descriptor_type desc_type,
                                       bool uses_store, bool bindless)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef rsrc;

   if (desc_type == AC_DESC_BUFFER) {
      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
   } else {
      assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK);
   }

   if (bindless)
      rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
   else
      rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);

   if (desc_type == AC_DESC_IMAGE && uses_store)
      rsrc = force_dcc_off(ctx, rsrc);
   return rsrc;
}

/**
 * Load an image view, fmask view. or sampler state descriptor.
 */
static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list,
                                         LLVMValueRef index, enum ac_descriptor_type type)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   switch (type) {
   case AC_DESC_IMAGE:
      /* The image is at [0:7]. */
      index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
      break;
   case AC_DESC_BUFFER:
      /* The buffer is in [4:7]. */
      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);
      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
      break;
   case AC_DESC_FMASK:
      /* The FMASK is at [8:15]. */
      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
      break;
   case AC_DESC_SAMPLER:
      /* The sampler state is at [12:15]. */
      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
                            LLVMConstInt(ctx->ac.i32, 3, 0));
      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
      break;
   case AC_DESC_PLANE_0:
   case AC_DESC_PLANE_1:
   case AC_DESC_PLANE_2:
      /* Only used for the multiplane image support for Vulkan. Should
       * never be reached in radeonsi.
       */
      unreachable("Plane descriptor requested in radeonsi.");
   }

   return ac_build_load_to_sgpr(&ctx->ac, list, index);
}
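Assuming each combined image+sampler slot is 16 dwords, and that the uncast list holds 8-dword elements while the pointer-cast list holds 4-dword (v4i32) elements, an assumption consistent with the [0:7]/[4:7]/[8:15]/[12:15] comments above rather than something this diff states, the index arithmetic lands on the dword ranges printed by this illustrative program:

#include <stdio.h>

int main(void)
{
   for (unsigned i = 0; i < 2; i++) {
      printf("slot %u: image dwords %u..%u, buffer dwords %u..%u, "
             "fmask dwords %u..%u, sampler dwords %u..%u\n",
             i,
             (i * 2) * 8, (i * 2) * 8 + 7,         /* AC_DESC_IMAGE:   index*2   in 8-dword units */
             (i * 4 + 1) * 4, (i * 4 + 1) * 4 + 3, /* AC_DESC_BUFFER:  index*4+1 in 4-dword units */
             (i * 2 + 1) * 8, (i * 2 + 1) * 8 + 7, /* AC_DESC_FMASK:   index*2+1 in 8-dword units */
             (i * 4 + 3) * 4, (i * 4 + 3) * 4 + 3);/* AC_DESC_SAMPLER: index*4+3 in 4-dword units */
   }
   return 0;
}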

static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set,
                                             unsigned base_index, unsigned constant_index,
                                             LLVMValueRef dynamic_index,
                                             enum ac_descriptor_type desc_type, bool image,
                                             bool write, bool bindless)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   LLVMBuilderRef builder = ctx->ac.builder;
   unsigned const_index = base_index + constant_index;

   assert(!descriptor_set);
   assert(desc_type <= AC_DESC_BUFFER);

   if (bindless) {
      LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);

      /* dynamic_index is the bindless handle */
      if (image) {
         /* Bindless image descriptors use 16-dword slots. */
         dynamic_index =
            LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
         /* FMASK is right after the image. */
         if (desc_type == AC_DESC_FMASK) {
            dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, "");
         }

         return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true);
      }

      /* Since bindless handle arithmetic can contain an unsigned integer
       * wraparound and si_load_sampler_desc assumes there isn't any,
       * use GEP without "inbounds" (inside ac_build_pointer_add)
       * to prevent incorrect code generation and hangs.
       */
      dynamic_index =
         LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
      list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
      return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);
   }

   unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
   assert(const_index < num_slots || dynamic_index);

   LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);
   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);

   if (dynamic_index) {
      index = LLVMBuildAdd(builder, index, dynamic_index, "");

      /* From the GL_ARB_shader_image_load_store extension spec:
       *
       *    If a shader performs an image load, store, or atomic
       *    operation using an image variable declared as an array,
       *    and if the index used to select an individual element is
       *    negative or greater than or equal to the size of the
       *    array, the results of the operation are undefined but may
       *    not lead to termination.
       */
      index = si_llvm_bound_index(ctx, index, num_slots);
   }

   if (image) {
      /* FMASKs are separate from images. */
      if (desc_type == AC_DESC_FMASK) {
         index =
            LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), "");
      }
      index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0),
                           index, "");
      return si_load_image_desc(ctx, list, index, desc_type, write, false);
   }

   index = LLVMBuildAdd(ctx->ac.builder, index,
                        LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");
   return si_load_sampler_desc(ctx, list, index, desc_type);
}

void si_llvm_init_resource_callbacks(struct si_shader_context *ctx)
{
   ctx->abi.load_ubo = load_ubo;
   ctx->abi.load_ssbo = load_ssbo;
   ctx->abi.load_sampler_desc = si_nir_load_sampler_desc;
}
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -25,17 +25,16 @@
#include "si_build_pm4.h"

/* For MSAA sample positions. */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
   ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) | \
    (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) | \
    (((unsigned)(s2y)&0xf) << 20) | (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))

/* For obtaining location coordinates from registers */
#define SEXT4(x)               ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
#define GET_SX(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
#define GET_SY(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)

/* The following sample ordering is required by EQAA.
 *
@@ -88,132 +87,128 @@

/* 1x MSAA */
static const uint32_t sample_locs_1x =
   FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */
static const uint64_t centroid_priority_1x = 0x0000000000000000ull;

/* 2x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_2x =
   FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */
static const uint64_t centroid_priority_2x = 0x1010101010101010ull;

/* 4x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 2, 6, -6, 2, 6, -2);
static const uint64_t centroid_priority_4x = 0x3210321032103210ull;

/* 8x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_8x[] = {
   FILL_SREG(-3, -5, 5, 1, -1, 3, 7, -7),
   FILL_SREG(-7, -1, 3, 7, -5, 5, 1, -3),
   /* The following are unused by hardware, but we emit them to IBs
    * instead of multiple SET_CONTEXT_REG packets. */
   0,
   0,
};
static const uint64_t centroid_priority_8x = 0x3546012735460127ull;

/* 16x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_16x[] = {
   FILL_SREG(-5, -2, 5, 3, -2, 6, 3, -5),
   FILL_SREG(-4, -6, 1, 1, -6, 4, 7, -4),
   FILL_SREG(-1, -3, 6, 7, -3, 2, 0, -7),
   FILL_SREG(-7, -8, 2, 5, -8, 0, 4, -1),
};
static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull;

static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
                                   unsigned sample_index, float *out_value)
{
   const uint32_t *sample_locs;

   switch (sample_count) {
   case 1:
   default:
      sample_locs = &sample_locs_1x;
      break;
   case 2:
      sample_locs = &sample_locs_2x;
      break;
   case 4:
      sample_locs = &sample_locs_4x;
      break;
   case 8:
      sample_locs = sample_locs_8x;
      break;
   case 16:
      sample_locs = sample_locs_16x;
      break;
   }

   out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
   out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
}
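A worked example of the decode in si_get_sample_position: for 2x MSAA, FILL_SREG(-4, -4, 4, 4, ...) stores each coordinate as a signed 4-bit value in 1/16-pixel units, so the two samples come out at (0.25, 0.25) and (0.75, 0.75). The stand-alone program below reproduces that arithmetic; it is illustrative only and shares nothing with the driver beyond the two macros:

#include <stdio.h>

#define SEXT4(x)               ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)

int main(void)
{
   /* Same packing as FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0). */
   unsigned locs_2x = (((unsigned)-4 & 0xf) << 0) | (((unsigned)-4 & 0xf) << 4) |
                      (((unsigned)4 & 0xf) << 8) | (((unsigned)4 & 0xf) << 12);

   for (int s = 0; s < 2; s++) {
      float x = (GET_SFIELD(locs_2x, s * 2) + 8) / 16.0f;
      float y = (GET_SFIELD(locs_2x, s * 2 + 1) + 8) / 16.0f;
      printf("sample %d: (%.2f, %.2f)\n", s, x, y); /* prints (0.25, 0.25) then (0.75, 0.75) */
   }
   return 0;
}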

static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
                                      uint32_t sample_locs)
{
   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);
   radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
   radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
   radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
   radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
}

static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
                                       const uint32_t *sample_locs, unsigned num_samples)
{
   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);
   radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                              num_samples == 8 ? 14 : 16);
   radeon_emit_array(cs, sample_locs, 4);
   radeon_emit_array(cs, sample_locs, 4);
   radeon_emit_array(cs, sample_locs, 4);
   radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4);
}

void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples)
{
   switch (nr_samples) {
   default:
   case 1:
      si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x);
      break;
   case 2:
      si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x);
      break;
   case 4:
      si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x);
      break;
   case 8:
      si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8);
      break;
   case 16:
      si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16);
      break;
   }
}

void si_init_msaa_functions(struct si_context *sctx)
{
   int i;

   sctx->b.get_sample_position = si_get_sample_position;

   si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]);

   for (i = 0; i < 2; i++)
      si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]);
   for (i = 0; i < 4; i++)
      si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]);
   for (i = 0; i < 8; i++)
      si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]);
   for (i = 0; i < 16; i++)
      si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]);
}
File diff suppressed because it is too large
@@ -23,395 +23,372 @@
 */

#include "si_build_pm4.h"

#include "util/u_memory.h"
#include "util/u_suballoc.h"

static void si_set_streamout_enable(struct si_context *sctx, bool enable);

static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}

static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
                                                              struct pipe_resource *buffer,
                                                              unsigned buffer_offset,
                                                              unsigned buffer_size)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_streamout_target *t;
   struct si_resource *buf = si_resource(buffer);

   t = CALLOC_STRUCT(si_streamout_target);
   if (!t) {
      return NULL;
   }

   unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
   u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
                        &t->buf_filled_size_offset, (struct pipe_resource **)&t->buf_filled_size);
   if (!t->buf_filled_size) {
      FREE(t);
      return NULL;
   }

   t->b.reference.count = 1;
   t->b.context = ctx;
   pipe_resource_reference(&t->b.buffer, buffer);
   t->b.buffer_offset = buffer_offset;
   t->b.buffer_size = buffer_size;

   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size);
   return &t->b;
}

static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
{
   struct si_streamout_target *t = (struct si_streamout_target *)target;
   pipe_resource_reference(&t->b.buffer, NULL);
   si_resource_reference(&t->buf_filled_size, NULL);
   FREE(t);
}

void si_streamout_buffers_dirty(struct si_context *sctx)
{
   if (!sctx->streamout.enabled_mask)
      return;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
   si_set_streamout_enable(sctx, true);
}

static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
                                     struct pipe_stream_output_target **targets,
                                     const unsigned *offsets)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;
   bool wait_now = false;

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
      /* Since streamout uses vector writes which go through TC L2
       * and most other clients can use TC L2 as well, we don't need
       * to flush it.
       *
       * The only cases which requires flushing it is VGT DMA index
       * fetching (on <= GFX7) and indirect draw data, which are rare
       * cases. Thus, flag the TC L2 dirtiness in the resource and
       * handle it at draw call time.
       */
      for (i = 0; i < sctx->streamout.num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

      /* Invalidate the scalar cache in case a streamout buffer is
       * going to be used as a constant buffer.
       *
       * Invalidate vL1, because streamout bypasses it (done by
       * setting GLC=1 in the store instruction), but vL1 in other
       * CUs can contain outdated data of streamout buffers.
       *
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;

      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
      if (sctx->screen->use_ngg_streamout) {
         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

         /* Wait now. This is needed to make sure that GDS is not
          * busy at the end of IBs.
          *
          * Also, the next streamout operation will overwrite GDS,
          * so we need to make sure that it's idle.
          */
         wait_now = true;
      } else {
         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
      }
   }

   /* All readers of the streamout targets need to be finished before we can
    * start writing to the targets.
    */
   if (num_targets) {
      if (sctx->screen->use_ngg_streamout)
         si_allocate_gds(sctx);

      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */

   /* Stop streamout. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
      si_emit_streamout_end(sctx);

   /* Set the new targets. */
   unsigned enabled_mask = 0, append_bitmask = 0;
   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
      if (!targets[i])
         continue;

      si_context_add_resource_size(sctx, targets[i]->buffer);
      enabled_mask |= 1 << i;

      if (offsets[i] == ((unsigned)-1))
         append_bitmask |= 1 << i;
   }

   for (; i < sctx->streamout.num_targets; i++)
      si_so_target_reference(&sctx->streamout.targets[i], NULL);

   sctx->streamout.enabled_mask = enabled_mask;
   sctx->streamout.num_targets = num_targets;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }

   /* Set the shader resources.*/
   for (i = 0; i < num_targets; i++) {
      if (targets[i]) {
         struct pipe_shader_buffer sbuf;
         sbuf.buffer = targets[i]->buffer;

         if (sctx->screen->use_ngg_streamout) {
            sbuf.buffer_offset = targets[i]->buffer_offset;
            sbuf.buffer_size = targets[i]->buffer_size;
         } else {
            sbuf.buffer_offset = 0;
            sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
         }

         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
      } else {
         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
      }
   }
   for (; i < old_num_targets; i++)
      si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

   if (wait_now)
      sctx->emit_cache_flush(sctx);
}
|
||||
|
||||
static void gfx10_emit_streamout_begin(struct si_context *sctx)
|
||||
{
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
unsigned last_target = 0;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
unsigned last_target = 0;
|
||||
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (t[i])
|
||||
last_target = i;
|
||||
}
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (t[i])
|
||||
last_target = i;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
|
||||
t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
|
||||
|
||||
bool append = sctx->streamout.append_bitmask & (1 << i);
|
||||
uint64_t va = 0;
|
||||
bool append = sctx->streamout.append_bitmask & (1 << i);
|
||||
uint64_t va = 0;
|
||||
|
||||
if (append) {
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
|
||||
t[i]->buf_filled_size,
|
||||
RADEON_USAGE_READ,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
if (append) {
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
|
||||
va = t[i]->buf_filled_size->gpu_address +
|
||||
t[i]->buf_filled_size_offset;
|
||||
}
|
||||
va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
}
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
|
||||
radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
|
||||
S_411_DST_SEL(V_411_GDS) |
|
||||
S_411_CP_SYNC(i == last_target));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, 4 * i); /* destination in GDS */
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
|
||||
S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
|
||||
}
|
||||
radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
|
||||
radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
|
||||
S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, 4 * i); /* destination in GDS */
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
|
||||
}
|
||||
|
||||
sctx->streamout.begin_emitted = true;
|
||||
sctx->streamout.begin_emitted = true;
|
||||
}
|
||||
|
||||
static void gfx10_emit_streamout_end(struct si_context *sctx)
|
||||
{
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
|
||||
si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0,
|
||||
EOP_DST_SEL_TC_L2,
|
||||
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
|
||||
EOP_DATA_SEL_GDS,
|
||||
t[i]->buf_filled_size, va,
|
||||
EOP_DATA_GDS(i, 1), 0);
|
||||
si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
|
||||
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
|
||||
t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);
|
||||
|
||||
t[i]->buf_filled_size_valid = true;
|
||||
}
|
||||
t[i]->buf_filled_size_valid = true;
|
||||
}
|
||||
|
||||
sctx->streamout.begin_emitted = false;
|
||||
sctx->streamout.begin_emitted = false;
|
||||
}
|
||||
|
||||
static void si_flush_vgt_streamout(struct si_context *sctx)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
unsigned reg_strmout_cntl;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
unsigned reg_strmout_cntl;
|
||||
|
||||
/* The register is at different places on different ASICs. */
|
||||
if (sctx->chip_class >= GFX7) {
|
||||
reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
|
||||
radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
|
||||
} else {
|
||||
reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
|
||||
radeon_set_config_reg(cs, reg_strmout_cntl, 0);
|
||||
}
|
||||
/* The register is at different places on different ASICs. */
|
||||
if (sctx->chip_class >= GFX7) {
|
||||
reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
|
||||
radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
|
||||
} else {
|
||||
reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
|
||||
radeon_set_config_reg(cs, reg_strmout_cntl, 0);
|
||||
}
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
|
||||
radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
|
||||
radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
|
||||
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
|
||||
radeon_emit(cs, 4); /* poll interval */
|
||||
radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
|
||||
radeon_emit(cs,
|
||||
WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
|
||||
radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
|
||||
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
|
||||
radeon_emit(cs, 4); /* poll interval */
|
||||
}
|
||||
|
||||
static void si_emit_streamout_begin(struct si_context *sctx)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
|
||||
unsigned i;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
|
||||
unsigned i;
|
||||
|
||||
si_flush_vgt_streamout(sctx);
|
||||
si_flush_vgt_streamout(sctx);
|
||||
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
t[i]->stride_in_dw = stride_in_dw[i];
|
||||
t[i]->stride_in_dw = stride_in_dw[i];
|
||||
|
||||
/* AMD GCN binds streamout buffers as shader resources.
|
||||
* VGT only counts primitives and tells the shader
|
||||
* through SGPRs what to do. */
|
||||
radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
|
||||
radeon_emit(cs, (t[i]->b.buffer_offset +
|
||||
t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
|
||||
radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */
|
||||
/* AMD GCN binds streamout buffers as shader resources.
|
||||
* VGT only counts primitives and tells the shader
|
||||
* through SGPRs what to do. */
|
||||
radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
|
||||
radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
|
||||
radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */
|
||||
|
||||
if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address +
|
||||
t[i]->buf_filled_size_offset;
|
||||
if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
|
||||
/* Append. */
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, va); /* src address lo */
|
||||
radeon_emit(cs, va >> 32); /* src address hi */
|
||||
/* Append. */
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, va); /* src address lo */
|
||||
radeon_emit(cs, va >> 32); /* src address hi */
|
||||
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
|
||||
t[i]->buf_filled_size,
|
||||
RADEON_USAGE_READ,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
} else {
|
||||
/* Start from the beginning. */
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
}
|
||||
}
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
} else {
|
||||
/* Start from the beginning. */
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
}
|
||||
}
|
||||
|
||||
sctx->streamout.begin_emitted = true;
|
||||
sctx->streamout.begin_emitted = true;
|
||||
}
|
||||
|
||||
void si_emit_streamout_end(struct si_context *sctx)
|
||||
{
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
gfx10_emit_streamout_end(sctx);
|
||||
return;
|
||||
}
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
gfx10_emit_streamout_end(sctx);
|
||||
return;
|
||||
}
|
||||
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
unsigned i;
|
||||
uint64_t va;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
unsigned i;
|
||||
uint64_t va;
|
||||
|
||||
si_flush_vgt_streamout(sctx);
|
||||
si_flush_vgt_streamout(sctx);
|
||||
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
|
||||
STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
|
||||
radeon_emit(cs, va); /* dst address lo */
|
||||
radeon_emit(cs, va >> 32); /* dst address hi */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
|
||||
STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
|
||||
radeon_emit(cs, va); /* dst address lo */
|
||||
radeon_emit(cs, va >> 32); /* dst address hi */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
|
||||
t[i]->buf_filled_size,
|
||||
RADEON_USAGE_WRITE,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
|
||||
/* Zero the buffer size. The counters (primitives generated,
|
||||
* primitives emitted) may be enabled even if there is not
|
||||
* buffer bound. This ensures that the primitives-emitted query
|
||||
* won't increment. */
|
||||
radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
|
||||
sctx->context_roll = true;
|
||||
/* Zero the buffer size. The counters (primitives generated,
|
||||
* primitives emitted) may be enabled even if there is not
|
||||
* buffer bound. This ensures that the primitives-emitted query
|
||||
* won't increment. */
|
||||
radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
|
||||
sctx->context_roll = true;
|
||||
|
||||
t[i]->buf_filled_size_valid = true;
|
||||
}
|
||||
t[i]->buf_filled_size_valid = true;
|
||||
}
|
||||
|
||||
sctx->streamout.begin_emitted = false;
|
||||
sctx->streamout.begin_emitted = false;
|
||||
}
|
||||
|
||||
/* STREAMOUT CONFIG DERIVED STATE
|
||||
|
|
@ -423,71 +400,65 @@ void si_emit_streamout_end(struct si_context *sctx)
|
|||
|
||||
static void si_emit_streamout_enable(struct si_context *sctx)
|
||||
{
|
||||
assert(!sctx->screen->use_ngg_streamout);
|
||||
assert(!sctx->screen->use_ngg_streamout);
|
||||
|
||||
radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
|
||||
radeon_emit(sctx->gfx_cs,
|
||||
S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_RAST_STREAM(0) |
|
||||
S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
|
||||
radeon_emit(sctx->gfx_cs,
|
||||
sctx->streamout.hw_enabled_mask &
|
||||
sctx->streamout.enabled_stream_buffers_mask);
|
||||
radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
|
||||
radeon_emit(sctx->gfx_cs, S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_RAST_STREAM(0) |
|
||||
S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
|
||||
radeon_emit(sctx->gfx_cs,
|
||||
sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
|
||||
}
|
||||
|
||||
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
|
||||
{
|
||||
bool old_strmout_en = si_get_strmout_en(sctx);
|
||||
unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
|
||||
bool old_strmout_en = si_get_strmout_en(sctx);
|
||||
unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
|
||||
|
||||
sctx->streamout.streamout_enabled = enable;
|
||||
sctx->streamout.streamout_enabled = enable;
|
||||
|
||||
sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
|
||||
(sctx->streamout.enabled_mask << 4) |
|
||||
(sctx->streamout.enabled_mask << 8) |
|
||||
(sctx->streamout.enabled_mask << 12);
|
||||
sctx->streamout.hw_enabled_mask =
|
||||
sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
|
||||
(sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
|
||||
|
||||
if (!sctx->screen->use_ngg_streamout &&
|
||||
((old_strmout_en != si_get_strmout_en(sctx)) ||
|
||||
(old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
|
||||
if (!sctx->screen->use_ngg_streamout &&
|
||||
((old_strmout_en != si_get_strmout_en(sctx)) ||
|
||||
(old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
|
||||
}
|
||||
|
||||
void si_update_prims_generated_query_state(struct si_context *sctx,
|
||||
unsigned type, int diff)
|
||||
void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
|
||||
{
|
||||
if (!sctx->screen->use_ngg_streamout &&
|
||||
type == PIPE_QUERY_PRIMITIVES_GENERATED) {
|
||||
bool old_strmout_en = si_get_strmout_en(sctx);
|
||||
if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
|
||||
bool old_strmout_en = si_get_strmout_en(sctx);
|
||||
|
||||
sctx->streamout.num_prims_gen_queries += diff;
|
||||
assert(sctx->streamout.num_prims_gen_queries >= 0);
|
||||
sctx->streamout.num_prims_gen_queries += diff;
|
||||
assert(sctx->streamout.num_prims_gen_queries >= 0);
|
||||
|
||||
sctx->streamout.prims_gen_query_enabled =
|
||||
sctx->streamout.num_prims_gen_queries != 0;
|
||||
sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;
|
||||
|
||||
if (old_strmout_en != si_get_strmout_en(sctx))
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
|
||||
if (old_strmout_en != si_get_strmout_en(sctx))
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
|
||||
|
||||
if (si_update_ngg(sctx)) {
|
||||
si_shader_change_notify(sctx);
|
||||
sctx->do_update_shaders = true;
|
||||
}
|
||||
}
|
||||
if (si_update_ngg(sctx)) {
|
||||
si_shader_change_notify(sctx);
|
||||
sctx->do_update_shaders = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void si_init_streamout_functions(struct si_context *sctx)
|
||||
{
|
||||
sctx->b.create_stream_output_target = si_create_so_target;
|
||||
sctx->b.stream_output_target_destroy = si_so_target_destroy;
|
||||
sctx->b.set_stream_output_targets = si_set_streamout_targets;
|
||||
sctx->b.create_stream_output_target = si_create_so_target;
|
||||
sctx->b.stream_output_target_destroy = si_so_target_destroy;
|
||||
sctx->b.set_stream_output_targets = si_set_streamout_targets;
|
||||
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
|
||||
} else {
|
||||
sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
|
||||
sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
|
||||
}
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
|
||||
} else {
|
||||
sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
|
||||
sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff

@@ -26,8 +26,8 @@
/* This file implements randomized SDMA texture blit tests. */

#include "si_pipe.h"
#include "util/rand_xor.h"
#include "util/u_surface.h"

static uint64_t seed_xorshift128plus[2];

@@ -36,382 +36,356 @@ static uint64_t seed_xorshift128plus[2];

/* The GPU blits are emulated on the CPU using these CPU textures. */

struct cpu_texture {
   uint8_t *ptr;
   uint64_t size;
   uint64_t layer_stride;
   unsigned stride;
};

static void alloc_cpu_texture(struct cpu_texture *tex, struct pipe_resource *templ)
{
   tex->stride = align(util_format_get_stride(templ->format, templ->width0), RAND_NUM_SIZE);
   tex->layer_stride = (uint64_t)tex->stride * templ->height0;
   tex->size = tex->layer_stride * templ->array_size;
   tex->ptr = malloc(tex->size);
   assert(tex->ptr);
}

static void set_random_pixels(struct pipe_context *ctx, struct pipe_resource *tex,
                              struct cpu_texture *cpu)
{
   struct pipe_transfer *t;
   uint8_t *map;
   int x, y, z;

   map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE, 0, 0, 0, tex->width0, tex->height0,
                              tex->array_size, &t);
   assert(map);

   for (z = 0; z < tex->array_size; z++) {
      for (y = 0; y < tex->height0; y++) {
         uint64_t *ptr = (uint64_t *)(map + t->layer_stride * z + t->stride * y);
         uint64_t *ptr_cpu = (uint64_t *)(cpu->ptr + cpu->layer_stride * z + cpu->stride * y);
         unsigned size = cpu->stride / RAND_NUM_SIZE;

         assert(t->stride % RAND_NUM_SIZE == 0);
         assert(cpu->stride % RAND_NUM_SIZE == 0);

         for (x = 0; x < size; x++) {
            *ptr++ = *ptr_cpu++ = rand_xorshift128plus(seed_xorshift128plus);
         }
      }
   }

   pipe_transfer_unmap(ctx, t);
}

static bool compare_textures(struct pipe_context *ctx, struct pipe_resource *tex,
                             struct cpu_texture *cpu)
{
   struct pipe_transfer *t;
   uint8_t *map;
   int y, z;
   bool pass = true;
   unsigned stride = util_format_get_stride(tex->format, tex->width0);

   map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ, 0, 0, 0, tex->width0, tex->height0,
                              tex->array_size, &t);
   assert(map);

   for (z = 0; z < tex->array_size; z++) {
      for (y = 0; y < tex->height0; y++) {
         uint8_t *ptr = map + t->layer_stride * z + t->stride * y;
         uint8_t *cpu_ptr = cpu->ptr + cpu->layer_stride * z + cpu->stride * y;

         if (memcmp(ptr, cpu_ptr, stride)) {
            pass = false;
            goto done;
         }
      }
   }
done:
   pipe_transfer_unmap(ctx, t);
   return pass;
}

static enum pipe_format choose_format()
{
   enum pipe_format formats[] = {
      PIPE_FORMAT_R8_UINT,     PIPE_FORMAT_R16_UINT,          PIPE_FORMAT_R32_UINT,
      PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, PIPE_FORMAT_G8R8_B8R8_UNORM,
   };
   return formats[rand() % ARRAY_SIZE(formats)];
}

static const char *array_mode_to_string(struct si_screen *sscreen, struct radeon_surf *surf)
{
   if (sscreen->info.chip_class >= GFX9) {
      switch (surf->u.gfx9.surf.swizzle_mode) {
      case 0:
         return " LINEAR";
      case 21:
         return " 4KB_S_X";
      case 22:
         return " 4KB_D_X";
      case 25:
         return "64KB_S_X";
      case 26:
         return "64KB_D_X";
      default:
         printf("Unhandled swizzle mode = %u\n", surf->u.gfx9.surf.swizzle_mode);
         return " UNKNOWN";
      }
   } else {
      switch (surf->u.legacy.level[0].mode) {
      case RADEON_SURF_MODE_LINEAR_ALIGNED:
         return "LINEAR_ALIGNED";
      case RADEON_SURF_MODE_1D:
         return "1D_TILED_THIN1";
      case RADEON_SURF_MODE_2D:
         return "2D_TILED_THIN1";
      default:
         assert(0);
         return " UNKNOWN";
      }
   }
}

static unsigned generate_max_tex_side(unsigned max_tex_side)
{
   switch (rand() % 4) {
   case 0:
      /* Try to hit large sizes in 1/4 of the cases. */
      return max_tex_side;
   case 1:
      /* Try to hit 1D tiling in 1/4 of the cases. */
      return 128;
   default:
      /* Try to hit common sizes in 2/4 of the cases. */
      return 2048;
   }
}

void si_test_dma(struct si_screen *sscreen)
{
   struct pipe_screen *screen = &sscreen->b;
   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
   struct si_context *sctx = (struct si_context *)ctx;
   uint64_t max_alloc_size;
   unsigned i, iterations, num_partial_copies, max_tex_side;
   unsigned num_pass = 0, num_fail = 0;

   max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE);

   /* Max 128 MB allowed for both textures. */
   max_alloc_size = 128 * 1024 * 1024;

   /* the seed for random test parameters */
   srand(0x9b47d95b);
   /* the seed for random pixel data */
   s_rand_xorshift128plus(seed_xorshift128plus, false);

   iterations = 1000000000; /* just kill it when you are bored */
   num_partial_copies = 30;

   /* These parameters are randomly generated per test:
    * - whether to do one whole-surface copy or N partial copies per test
    * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D)
    * - which texture dimensions to use
    * - whether to use VRAM (all tiling modes) and GTT (staging, linear
    *   only) allocations
    * - random initial pixels in src
    * - generate random subrectangle copies for partial blits
    */
   for (i = 0; i < iterations; i++) {
      struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
      struct si_texture *sdst;
      struct si_texture *ssrc;
      struct cpu_texture src_cpu, dst_cpu;
      unsigned max_width, max_height, max_depth, j, num;
      unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen;
      unsigned max_tex_layers;
      bool pass;
      bool do_partial_copies = rand() & 1;

      /* generate a random test case */
      tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY;
      tsrc.depth0 = tdst.depth0 = 1;

      tsrc.format = tdst.format = choose_format();

      max_tex_side_gen = generate_max_tex_side(max_tex_side);
      max_tex_layers = rand() % 4 ? 1 : 5;

      tsrc.width0 = (rand() % max_tex_side_gen) + 1;
      tsrc.height0 = (rand() % max_tex_side_gen) + 1;
      tsrc.array_size = (rand() % max_tex_layers) + 1;

      if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM)
         tsrc.width0 = align(tsrc.width0, 2);

      /* Have a 1/4 chance of getting power-of-two dimensions. */
      if (rand() % 4 == 0) {
         tsrc.width0 = util_next_power_of_two(tsrc.width0);
         tsrc.height0 = util_next_power_of_two(tsrc.height0);
      }

      if (!do_partial_copies) {
         /* whole-surface copies only, same dimensions */
         tdst = tsrc;
      } else {
         max_tex_side_gen = generate_max_tex_side(max_tex_side);
         max_tex_layers = rand() % 4 ? 1 : 5;

         /* many partial copies, dimensions can be different */
         tdst.width0 = (rand() % max_tex_side_gen) + 1;
         tdst.height0 = (rand() % max_tex_side_gen) + 1;
         tdst.array_size = (rand() % max_tex_layers) + 1;

         /* Have a 1/4 chance of getting power-of-two dimensions. */
         if (rand() % 4 == 0) {
            tdst.width0 = util_next_power_of_two(tdst.width0);
            tdst.height0 = util_next_power_of_two(tdst.height0);
         }
      }

      /* check texture sizes */
      if ((uint64_t)util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0) *
             tsrc.array_size * util_format_get_blocksize(tsrc.format) +
          (uint64_t)util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0) *
             tdst.array_size * util_format_get_blocksize(tdst.format) >
          max_alloc_size) {
         /* too large, try again */
         i--;
         continue;
      }

      /* VRAM + the tiling mode depends on dimensions (3/4 of cases),
       * or GTT + linear only (1/4 of cases)
       */
      tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
      tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;

      /* Allocate textures (both the GPU and CPU copies).
       * The CPU will emulate what the GPU should be doing.
       */
      src = screen->resource_create(screen, &tsrc);
      dst = screen->resource_create(screen, &tdst);
      assert(src);
      assert(dst);
      sdst = (struct si_texture *)dst;
      ssrc = (struct si_texture *)src;
      alloc_cpu_texture(&src_cpu, &tsrc);
      alloc_cpu_texture(&dst_cpu, &tdst);

      printf("%4u: dst = (%5u x %5u x %u, %s), "
             " src = (%5u x %5u x %u, %s), format = %s, ",
             i, tdst.width0, tdst.height0, tdst.array_size,
             array_mode_to_string(sscreen, &sdst->surface), tsrc.width0, tsrc.height0,
             tsrc.array_size, array_mode_to_string(sscreen, &ssrc->surface),
             util_format_description(tsrc.format)->name);
      fflush(stdout);

      /* set src pixels */
      set_random_pixels(ctx, src, &src_cpu);

      /* clear dst pixels */
      uint32_t zero = 0;
      si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, SI_COHERENCY_SHADER, false);
      memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);

      /* preparation */
      max_width = MIN2(tsrc.width0, tdst.width0);
      max_height = MIN2(tsrc.height0, tdst.height0);
      max_depth = MIN2(tsrc.array_size, tdst.array_size);

      num = do_partial_copies ? num_partial_copies : 1;
      for (j = 0; j < num; j++) {
         int width, height, depth;
         int srcx, srcy, srcz, dstx, dsty, dstz;
         struct pipe_box box;
         unsigned old_num_draw_calls = sctx->num_draw_calls;
         unsigned old_num_dma_calls = sctx->num_dma_calls;
         unsigned old_num_cs_calls = sctx->num_compute_calls;

         if (!do_partial_copies) {
            /* copy whole src to dst */
            width = max_width;
            height = max_height;
            depth = max_depth;

            srcx = srcy = srcz = dstx = dsty = dstz = 0;
         } else {
            /* random sub-rectangle copies from src to dst */
            depth = (rand() % max_depth) + 1;
            srcz = rand() % (tsrc.array_size - depth + 1);
            dstz = rand() % (tdst.array_size - depth + 1);

            /* special code path to hit the tiled partial copies */
            if (!ssrc->surface.is_linear && !sdst->surface.is_linear && rand() & 1) {
               if (max_width < 8 || max_height < 8)
                  continue;
               width = ((rand() % (max_width / 8)) + 1) * 8;
               height = ((rand() % (max_height / 8)) + 1) * 8;

               srcx = rand() % (tsrc.width0 - width + 1) & ~0x7;
               srcy = rand() % (tsrc.height0 - height + 1) & ~0x7;

               dstx = rand() % (tdst.width0 - width + 1) & ~0x7;
               dsty = rand() % (tdst.height0 - height + 1) & ~0x7;
            } else {
               /* just make sure that it doesn't divide by zero */
               assert(max_width > 0 && max_height > 0);

               width = (rand() % max_width) + 1;
               height = (rand() % max_height) + 1;

               srcx = rand() % (tsrc.width0 - width + 1);
               srcy = rand() % (tsrc.height0 - height + 1);

               dstx = rand() % (tdst.width0 - width + 1);
               dsty = rand() % (tdst.height0 - height + 1);
            }

            /* special code path to hit out-of-bounds reads in L2T */
            if (ssrc->surface.is_linear && !sdst->surface.is_linear && rand() % 4 == 0) {
               srcx = 0;
               srcy = 0;
               srcz = 0;
            }
         }

         /* GPU copy */
         u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
         sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);

         /* See which engine was used. */
         gfx_blits += sctx->num_draw_calls > old_num_draw_calls;
         dma_blits += sctx->num_dma_calls > old_num_dma_calls;
         cs_blits += sctx->num_compute_calls > old_num_cs_calls;

         /* CPU copy */
         util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, dst_cpu.layer_stride, dstx, dsty,
                       dstz, width, height, depth, src_cpu.ptr, src_cpu.stride,
                       src_cpu.layer_stride, srcx, srcy, srcz);
      }

      pass = compare_textures(ctx, dst, &dst_cpu);
      if (pass)
         num_pass++;
      else
         num_fail++;

      printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, dma_blits, cs_blits,
             pass ? "pass" : "fail", num_pass, num_pass + num_fail);

      /* cleanup */
      pipe_resource_reference(&src, NULL);
      pipe_resource_reference(&dst, NULL);
      free(src_cpu.ptr);
      free(dst_cpu.ptr);
   }

   ctx->destroy(ctx);
   exit(0);
}
@@ -28,451 +28,444 @@
#include "si_pipe.h"
|
||||
#include "si_query.h"
|
||||
|
||||
#define MIN_SIZE 512
|
||||
#define MAX_SIZE (128 * 1024 * 1024)
|
||||
#define SIZE_SHIFT 1
|
||||
#define NUM_RUNS 128
|
||||
#define MIN_SIZE 512
|
||||
#define MAX_SIZE (128 * 1024 * 1024)
|
||||
#define SIZE_SHIFT 1
|
||||
#define NUM_RUNS 128
|
||||
|
||||
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
|
||||
{
|
||||
return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
|
||||
return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
|
||||
}
|
||||
|
||||
void si_test_dma_perf(struct si_screen *sscreen)
|
||||
{
|
||||
struct pipe_screen *screen = &sscreen->b;
|
||||
struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
|
||||
struct si_context *sctx = (struct si_context*)ctx;
|
||||
const uint32_t clear_value = 0x12345678;
|
||||
static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
|
||||
static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
|
||||
struct pipe_screen *screen = &sscreen->b;
|
||||
struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
|
||||
struct si_context *sctx = (struct si_context *)ctx;
|
||||
const uint32_t clear_value = 0x12345678;
|
||||
static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
|
||||
static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
|
||||
|
||||
#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
|
||||
#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
|
||||
#define NUM_METHODS (4 + 2 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
|
||||
|
||||
static const char *method_str[] = {
|
||||
"CP MC ",
|
||||
"CP L2 ",
|
||||
"CP L2 ",
|
||||
"SDMA ",
|
||||
};
|
||||
static const char *placement_str[] = {
|
||||
/* Clear */
|
||||
"fill->VRAM",
|
||||
"fill->GTT ",
|
||||
/* Copy */
|
||||
"VRAM->VRAM",
|
||||
"VRAM->GTT ",
|
||||
"GTT ->VRAM",
|
||||
};
|
||||
static const char *method_str[] = {
|
||||
"CP MC ",
|
||||
"CP L2 ",
|
||||
"CP L2 ",
|
||||
"SDMA ",
|
||||
};
|
||||
static const char *placement_str[] = {
|
||||
/* Clear */
|
||||
"fill->VRAM",
|
||||
"fill->GTT ",
|
||||
/* Copy */
|
||||
"VRAM->VRAM",
|
||||
"VRAM->GTT ",
|
||||
"GTT ->VRAM",
|
||||
};
|
||||
|
||||
printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
|
||||
printf("Heap ,Method ,L2p,Wa,");
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
|
||||
if (size >= 1024)
|
||||
printf("%6uKB,", size / 1024);
|
||||
else
|
||||
printf(" %6uB,", size);
|
||||
}
|
||||
printf("\n");
|
||||
printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
|
||||
printf("Heap ,Method ,L2p,Wa,");
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
|
||||
if (size >= 1024)
|
||||
printf("%6uKB,", size / 1024);
|
||||
else
|
||||
printf(" %6uB,", size);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
/* results[log2(size)][placement][method][] */
|
||||
struct si_result {
|
||||
bool is_valid;
|
||||
bool is_cp;
|
||||
bool is_sdma;
|
||||
bool is_cs;
|
||||
unsigned cache_policy;
|
||||
unsigned dwords_per_thread;
|
||||
unsigned waves_per_sh;
|
||||
unsigned score;
|
||||
unsigned index; /* index in results[x][y][index] */
|
||||
} results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
|
||||
/* results[log2(size)][placement][method][] */
|
||||
struct si_result {
|
||||
bool is_valid;
|
||||
bool is_cp;
|
||||
bool is_sdma;
|
||||
bool is_cs;
|
||||
unsigned cache_policy;
|
||||
unsigned dwords_per_thread;
|
||||
unsigned waves_per_sh;
|
||||
unsigned score;
|
||||
unsigned index; /* index in results[x][y][index] */
|
||||
} results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
|
||||
|
||||
/* Run benchmarks. */
|
||||
for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
|
||||
bool is_copy = placement >= 2;
|
||||
/* Run benchmarks. */
|
||||
for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
|
||||
bool is_copy = placement >= 2;
|
||||
|
||||
printf("-----------,--------,---,--,");
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
|
||||
printf("--------,");
|
||||
printf("\n");
|
||||
printf("-----------,--------,---,--,");
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
|
||||
printf("--------,");
|
||||
printf("\n");
|
||||
|
||||
for (unsigned method = 0; method < NUM_METHODS; method++) {
|
||||
bool test_cp = method <= 2;
|
||||
bool test_sdma = method == 3;
|
||||
bool test_cs = method >= 4;
|
||||
unsigned cs_method = method - 4;
|
||||
STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
|
||||
unsigned cs_waves_per_sh =
|
||||
test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
|
||||
cs_method %= 2*NUM_SHADERS;
|
||||
unsigned cache_policy = test_cp ? method % 3 :
|
||||
test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
|
||||
unsigned cs_dwords_per_thread =
|
||||
test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
|
||||
for (unsigned method = 0; method < NUM_METHODS; method++) {
|
||||
bool test_cp = method <= 2;
|
||||
bool test_sdma = method == 3;
|
||||
bool test_cs = method >= 4;
|
||||
unsigned cs_method = method - 4;
|
||||
STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
|
||||
unsigned cs_waves_per_sh =
|
||||
test_cs ? cs_waves_per_sh_list[cs_method / (2 * NUM_SHADERS)] : 0;
|
||||
cs_method %= 2 * NUM_SHADERS;
|
||||
unsigned cache_policy =
|
||||
test_cp ? method % 3 : test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
|
||||
unsigned cs_dwords_per_thread =
|
||||
test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
|
||||
|
||||
if (test_sdma && !sctx->sdma_cs)
|
||||
continue;
|
||||
if (test_sdma && !sctx->sdma_cs)
|
||||
continue;
|
||||
|
||||
if (sctx->chip_class == GFX6) {
|
||||
/* GFX6 doesn't support CP DMA operations through L2. */
|
||||
if (test_cp && cache_policy != L2_BYPASS)
|
||||
continue;
|
||||
/* WAVES_PER_SH is in multiples of 16 on GFX6. */
|
||||
if (test_cs && cs_waves_per_sh % 16 != 0)
|
||||
continue;
|
||||
}
|
||||
if (sctx->chip_class == GFX6) {
|
||||
/* GFX6 doesn't support CP DMA operations through L2. */
|
||||
if (test_cp && cache_policy != L2_BYPASS)
|
||||
continue;
|
||||
/* WAVES_PER_SH is in multiples of 16 on GFX6. */
|
||||
if (test_cs && cs_waves_per_sh % 16 != 0)
|
||||
continue;
|
||||
}
|
||||
|
||||
printf("%s ,", placement_str[placement]);
|
||||
if (test_cs) {
|
||||
printf("CS x%-4u,%3s,", cs_dwords_per_thread,
|
||||
cache_policy == L2_LRU ? "LRU" :
|
||||
cache_policy == L2_STREAM ? "Str" : "");
|
||||
} else {
|
||||
printf("%s,%3s,", method_str[method],
|
||||
method == L2_LRU ? "LRU" :
|
||||
method == L2_STREAM ? "Str" : "");
|
||||
}
|
||||
if (test_cs && cs_waves_per_sh)
|
||||
printf("%2u,", cs_waves_per_sh);
|
||||
else
|
||||
printf(" ,");
|
||||
printf("%s ,", placement_str[placement]);
|
||||
if (test_cs) {
|
||||
printf("CS x%-4u,%3s,", cs_dwords_per_thread,
|
||||
cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
|
||||
} else {
|
||||
printf("%s,%3s,", method_str[method],
|
||||
method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
|
||||
}
|
||||
if (test_cs && cs_waves_per_sh)
|
||||
printf("%2u,", cs_waves_per_sh);
|
||||
else
|
||||
printf(" ,");
|
||||
|
||||
double score = 0;
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
|
||||
/* Don't test bigger sizes if it's too slow. Print 0. */
|
||||
if (size >= 512*1024 &&
|
||||
score < 400 * (size / (4*1024*1024))) {
|
||||
printf("%7.0f ,", 0.0);
|
||||
continue;
|
||||
}
|
||||
double score = 0;
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
|
||||
/* Don't test bigger sizes if it's too slow. Print 0. */
|
||||
if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
|
||||
printf("%7.0f ,", 0.0);
|
||||
continue;
|
||||
}
|
||||
|
||||
enum pipe_resource_usage dst_usage, src_usage;
|
||||
struct pipe_resource *dst, *src;
|
||||
struct pipe_query *q[NUM_RUNS];
|
||||
unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
|
||||
enum pipe_resource_usage dst_usage, src_usage;
|
||||
struct pipe_resource *dst, *src;
|
||||
struct pipe_query *q[NUM_RUNS];
|
||||
unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
|
||||
|
||||
if (test_sdma) {
|
||||
if (sctx->chip_class == GFX6)
|
||||
query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
|
||||
else
|
||||
query_type = SI_QUERY_TIME_ELAPSED_SDMA;
|
||||
}
|
||||
if (test_sdma) {
|
||||
if (sctx->chip_class == GFX6)
|
||||
query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
|
||||
else
|
||||
query_type = SI_QUERY_TIME_ELAPSED_SDMA;
|
||||
}
|
||||
|
||||
if (placement == 0 || placement == 2 || placement == 4)
|
||||
dst_usage = PIPE_USAGE_DEFAULT;
|
||||
else
|
||||
dst_usage = PIPE_USAGE_STREAM;
|
||||
if (placement == 0 || placement == 2 || placement == 4)
|
||||
dst_usage = PIPE_USAGE_DEFAULT;
|
||||
else
|
||||
dst_usage = PIPE_USAGE_STREAM;
|
||||
|
||||
if (placement == 2 || placement == 3)
|
||||
src_usage = PIPE_USAGE_DEFAULT;
|
||||
else
|
||||
src_usage = PIPE_USAGE_STREAM;
|
||||
if (placement == 2 || placement == 3)
|
||||
src_usage = PIPE_USAGE_DEFAULT;
|
||||
else
|
||||
src_usage = PIPE_USAGE_STREAM;
|
||||
|
||||
dst = pipe_buffer_create(screen, 0, dst_usage, size);
|
||||
src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
|
||||
dst = pipe_buffer_create(screen, 0, dst_usage, size);
|
||||
src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
|
||||
|
||||
/* Run tests. */
|
||||
for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
|
||||
q[iter] = ctx->create_query(ctx, query_type, 0);
|
||||
ctx->begin_query(ctx, q[iter]);
|
||||
/* Run tests. */
|
||||
for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
|
||||
q[iter] = ctx->create_query(ctx, query_type, 0);
|
||||
ctx->begin_query(ctx, q[iter]);
|
||||
|
||||
if (test_cp) {
|
||||
/* CP DMA */
|
||||
if (is_copy) {
|
||||
si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
|
||||
SI_COHERENCY_NONE, cache_policy);
|
||||
} else {
|
||||
si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size,
|
||||
clear_value, 0,
|
||||
SI_COHERENCY_NONE, cache_policy);
|
||||
}
|
||||
} else if (test_sdma) {
|
||||
/* SDMA */
|
||||
if (is_copy) {
|
||||
si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
|
||||
} else {
|
||||
si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
|
||||
}
|
||||
} else {
|
||||
/* Compute */
|
||||
/* The memory accesses are coalesced, meaning that the 1st instruction writes
|
||||
* the 1st contiguous block of data for the whole wave, the 2nd instruction
|
||||
* writes the 2nd contiguous block of data, etc.
|
||||
*/
|
||||
unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
|
||||
unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
|
||||
unsigned dwords_per_wave = cs_dwords_per_thread * 64;
|
||||
if (test_cp) {
|
||||
/* CP DMA */
|
||||
if (is_copy) {
|
||||
si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
|
||||
cache_policy);
|
||||
} else {
|
||||
si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
|
||||
SI_COHERENCY_NONE, cache_policy);
|
||||
}
|
||||
} else if (test_sdma) {
|
||||
/* SDMA */
|
||||
if (is_copy) {
|
||||
si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
|
||||
} else {
|
||||
si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
|
||||
}
|
||||
} else {
|
||||
/* Compute */
|
||||
/* The memory accesses are coalesced, meaning that the 1st instruction writes
|
||||
* the 1st contiguous block of data for the whole wave, the 2nd instruction
|
||||
* writes the 2nd contiguous block of data, etc.
|
||||
*/
|
||||
unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
|
||||
unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
|
||||
unsigned dwords_per_wave = cs_dwords_per_thread * 64;
|
                  unsigned num_dwords = size / 4;
                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

                  void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
                                                          cache_policy == L2_STREAM, is_copy);

                  struct pipe_grid_info info = {};
                  info.block[0] = MIN2(64, num_instructions);
                  info.block[1] = 1;
                  info.block[2] = 1;
                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
                  info.grid[1] = 1;
                  info.grid[2] = 1;
                  struct pipe_shader_buffer sb[2] = {};
                  sb[0].buffer = dst;
                  sb[0].buffer_size = size;

                  if (is_copy) {
                     sb[1].buffer = src;
                     sb[1].buffer_size = size;
                  } else {
                     for (unsigned i = 0; i < 4; i++)
                        sctx->cs_user_data[i] = clear_value;
                  }

                  sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;

                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
                  ctx->bind_compute_state(ctx, cs);
                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;

                  ctx->launch_grid(ctx, &info);

                  ctx->bind_compute_state(ctx, NULL);
                  ctx->delete_compute_state(ctx, cs);
                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */

                  sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
               }

               /* Flush L2, so that we don't just test L2 cache performance. */
               if (!test_sdma) {
                  sctx->flags |= SI_CONTEXT_WB_L2;
                  sctx->emit_cache_flush(sctx);
               }

               ctx->end_query(ctx, q[iter]);
               ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
            }
            pipe_resource_reference(&dst, NULL);
            pipe_resource_reference(&src, NULL);

            /* Get results. */
            uint64_t min = ~0ull, max = 0, total = 0;

            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
               union pipe_query_result result;

               ctx->get_query_result(ctx, q[iter], true, &result);
               ctx->destroy_query(ctx, q[iter]);

               min = MIN2(min, result.u64);
               max = MAX2(max, result.u64);
               total += result.u64;
            }

            score = get_MBps_rate(size, total / (double)NUM_RUNS);
            printf("%7.0f ,", score);
            fflush(stdout);

            struct si_result *r = &results[util_logbase2(size)][placement][method];
            r->is_valid = true;
            r->is_cp = test_cp;
            r->is_sdma = test_sdma;
            r->is_cs = test_cs;
            r->cache_policy = cache_policy;
            r->dwords_per_thread = cs_dwords_per_thread;
            r->waves_per_sh = cs_waves_per_sh;
            r->score = score;
            r->index = method;
         }
         puts("");
      }
   }
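/* [Editor's illustrative sketch -- not part of this diff.]  get_MBps_rate() is
 * defined earlier in this file and does not appear in this hunk; the hypothetical
 * stand-in below shows the kind of conversion involved, assuming the averaged
 * TIME_ELAPSED query result is in nanoseconds and decimal megabytes are reported.
 */
static double example_mbps_rate(unsigned bytes, double average_elapsed_ns)
{
   double seconds = average_elapsed_ns / 1000000000.0;
   return (bytes / 1000000.0) / seconds; /* MB moved per second for one run */
}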

   puts("");
   puts("static struct si_method");
   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
          "cached)\n",
          sctx->screen->info.name);
   puts("{");
   puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
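/* [Editor's note -- not part of this diff.]  The puts/printf calls above emit C
 * source on stdout.  For a hypothetical GPU named EXAMPLE, the start of the emitted
 * helper would look roughly like this (body elided; the actual contents depend on
 * the measured results):
 *
 *    static struct si_method
 *    get_best_clear_for_EXAMPLE(enum radeon_bo_domain dst, uint64_t size64,
 *                               bool async, bool cached)
 *    {
 *       unsigned size = MIN2(size64, UINT_MAX);
 *       ...
 *    }
 */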

   /* Analyze results and find the best methods. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      if (placement == 0)
         puts("   if (dst == RADEON_DOMAIN_VRAM) {");
      else if (placement == 1)
         puts("   } else { /* GTT */");
      else if (placement == 2) {
         puts("}");
         puts("");
         puts("static struct si_method");
         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
                sctx->screen->info.name);
         printf(" uint64_t size64, bool async, bool cached)\n");
         puts("{");
         puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
         puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
      } else if (placement == 3)
         puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
      else
         puts("   } else { /* GTT -> VRAM */");

      for (unsigned mode = 0; mode < 3; mode++) {
         bool async = mode == 0;
         bool cached = mode == 1;

         if (async)
            puts("      if (async) { /* SDMA or async compute */");
         else if (cached)
            puts("      if (cached) { /* gfx ring */");
         else
            puts("      } else { /* gfx ring - uncached */");

         /* The list of best chosen methods. */
         struct si_result *methods[32];
         unsigned method_max_size[32];
         unsigned num_methods = 0;

         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Find the best method. */
            struct si_result *best = NULL;

            for (unsigned i = 0; i < NUM_METHODS; i++) {
               struct si_result *r = &results[util_logbase2(size)][placement][i];

               if (!r->is_valid)
                  continue;

               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
                * on GTT, which we can get due to BO evictions.
                */
               if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
                   r->cache_policy == L2_BYPASS)
                  continue;

               if (async) {
                  /* The following constraints for compute IBs try to limit
                   * resource usage so as not to decrease the performance
                   * of gfx IBs too much.
                   */

                  /* Don't use CP DMA on asynchronous rings, because
                   * the engine is shared with gfx IBs.
                   */
                  if (r->is_cp)
                     continue;

                  /* Don't use L2 caching on asynchronous rings to minimize
                   * L2 usage.
                   */
                  if (r->cache_policy == L2_LRU)
                     continue;

                  /* Asynchronous compute recommends waves_per_sh != 0
                   * to limit CU usage. */
                  if (r->is_cs && r->waves_per_sh == 0)
                     continue;
               } else {
                  /* SDMA is always asynchronous */
                  if (r->is_sdma)
                     continue;

                  if (cached && r->cache_policy == L2_BYPASS)
                     continue;
                  if (!cached && r->cache_policy == L2_LRU)
                     continue;
               }

               if (!best) {
                  best = r;
                  continue;
               }

               /* Assume some measurement error. Earlier methods occupy fewer
                * resources, so the next method is always more greedy, and we
                * don't want to select it due to a measurement error.
                */
               double min_improvement = 1.03;

               if (best->score * min_improvement < r->score)
                  best = r;
            }
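/* [Editor's illustrative sketch -- not part of this diff.]  The 1.03 factor above
 * means a later (more resource-hungry) method only displaces the current best if it
 * wins by more than 3%.  Standalone restatement with hypothetical scores: a
 * candidate at 102 MB/s does not displace a best of 100 MB/s, but 104 MB/s does.
 */
static int example_should_replace_best(double best_score, double candidate_score)
{
   const double min_improvement = 1.03;
   return best_score * min_improvement < candidate_score; /* non-zero: take the candidate */
}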

            if (num_methods > 0) {
               unsigned prev_index = num_methods - 1;
               struct si_result *prev = methods[prev_index];
               struct si_result *prev_this_size =
                  &results[util_logbase2(size)][placement][prev->index];

               /* If the best one is also the best for the previous size,
                * just bump the size for the previous one.
                *
                * If there is no best, it means all methods were too slow
                * for this size and were not tested. Use the best one for
                * the previous size.
                */
               if (!best ||
                   /* If it's the same method as for the previous size: */
                   (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
                    prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
                    prev->dwords_per_thread == best->dwords_per_thread &&
                    prev->waves_per_sh == best->waves_per_sh) ||
                   /* If the method for the previous size is also the best
                    * for this size: */
                   (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
                  method_max_size[prev_index] = size;
                  continue;
               }
            }

            /* Add it to the list. */
            assert(num_methods < ARRAY_SIZE(methods));
            methods[num_methods] = best;
            method_max_size[num_methods] = size;
            num_methods++;
         }

         for (unsigned i = 0; i < num_methods; i++) {
            struct si_result *best = methods[i];
            unsigned size = method_max_size[i];

            /* The size threshold is between the current benchmarked
             * size and the next benchmarked size. */
            if (i < num_methods - 1)
               printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
            else if (i > 0)
               printf("         else ");
            else
               printf("         ");
            printf("return ");
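/* [Editor's illustrative sketch -- not part of this diff.]  The emitted threshold is
 * the arithmetic midpoint between the current benchmarked size and the next one
 * (size << SIZE_SHIFT).  SIZE_SHIFT is defined earlier in this file; assuming it is
 * 1 (sizes doubling), a 4096-byte step yields (4096 + 8192) / 2 = 6144.
 */
static unsigned example_size_threshold(unsigned size, unsigned size_shift)
{
   return (size + (size << size_shift)) / 2;
}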

            assert(best);
            if (best->is_cp) {
               printf("CP_DMA(%s);\n",
                      best->cache_policy == L2_BYPASS
                         ? "L2_BYPASS"
                         : best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM");
            }
            if (best->is_sdma)
               printf("SDMA;\n");
            if (best->is_cs) {
               printf("COMPUTE(%s, %u, %u);\n",
                      best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM",
                      best->dwords_per_thread, best->waves_per_sh);
            }
         }
      }
      puts("      }");
   }
   puts("   }");
   puts("}");

   ctx->destroy(ctx);
   exit(0);
}
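/* [Editor's illustrative sketch -- not part of this diff.]  The benchmark prints
 * complete get_best_clear_for_*() / get_best_copy_for_*() functions to stdout so
 * they can be pasted into the driver.  Neither struct si_method nor the
 * CP_DMA/SDMA/COMPUTE macros the emitted code expands to appear in this hunk, so
 * the caller below is purely hypothetical.
 */
#if 0 /* illustration only */
static void example_use_generated_helper(void)
{
   struct si_method best = get_best_clear_for_EXAMPLE(RADEON_DOMAIN_VRAM, /*size64=*/65536,
                                                      /*async=*/false, /*cached=*/true);
   /* ...issue the clear with the engine and cache policy chosen in `best`... */
   (void)best;
}
#endif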
File diff suppressed because it is too large
Load diff
@@ -25,79 +25,77 @@
 *
 **************************************************************************/

#include "si_pipe.h"
|
||||
#include "radeon/radeon_video.h"
|
||||
#include "radeon/radeon_uvd.h"
|
||||
#include "radeon/radeon_uvd_enc.h"
|
||||
#include "radeon/radeon_vce.h"
|
||||
#include "radeon/radeon_vcn_dec.h"
|
||||
#include "radeon/radeon_vcn_enc.h"
|
||||
#include "radeon/radeon_uvd_enc.h"
|
||||
#include "radeon/radeon_video.h"
|
||||
#include "si_pipe.h"
|
||||
#include "util/u_video.h"
|
||||
|
||||
/**
 * creates a video buffer with a UVD-compatible memory layout
 */
struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
                                                 const struct pipe_video_buffer *tmpl)
{
   struct pipe_video_buffer vidbuf = *tmpl;
   /* TODO: get tiling working */
   vidbuf.bind |= PIPE_BIND_LINEAR;

   return vl_video_buffer_create_as_resource(pipe, &vidbuf);
}

/* set the decoding target buffer offsets */
static struct pb_buffer *si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
{
   struct si_screen *sscreen = (struct si_screen *)buf->base.context->screen;
   struct si_texture *luma = (struct si_texture *)buf->resources[0];
   struct si_texture *chroma = (struct si_texture *)buf->resources[1];
   enum ruvd_surface_type type =
      (sscreen->info.chip_class >= GFX9) ? RUVD_SURFACE_TYPE_GFX9 : RUVD_SURFACE_TYPE_LEGACY;

   msg->body.decode.dt_field_mode = buf->base.interlaced;

   si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type);

   return luma->buffer.buf;
}

/* get the radeon resources for VCE */
static void si_vce_get_buffer(struct pipe_resource *resource, struct pb_buffer **handle,
                              struct radeon_surf **surface)
{
   struct si_texture *res = (struct si_texture *)resource;

   if (handle)
      *handle = res->buffer.buf;

   if (surface)
      *surface = &res->surface;
}

/**
 * creates a UVD-compatible decoder
 */
struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
                                               const struct pipe_video_codec *templ)
{
   struct si_context *ctx = (struct si_context *)context;
   bool vcn = ctx->family >= CHIP_RAVEN;

   if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
      if (vcn) {
         return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
      } else {
         if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC)
            return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
         else
            return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
      }
   }

   return (vcn) ? radeon_create_decoder(context, templ)
                : si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb);
}
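/* [Editor's illustrative sketch -- not part of this diff.]  si_uvd_create_decoder()
 * above picks the encode/decode backend from the chip family and the codec
 * entrypoint.  The standalone helper below restates that selection; the enum and
 * function names are hypothetical.
 */
enum example_video_backend {
   EXAMPLE_BACKEND_VCN,     /* radeon_create_encoder / radeon_create_decoder */
   EXAMPLE_BACKEND_UVD_ENC, /* radeon_uvd_create_encoder: HEVC encode before VCN */
   EXAMPLE_BACKEND_VCE,     /* si_vce_create_encoder: other encode before VCN */
   EXAMPLE_BACKEND_UVD_DEC, /* si_common_uvd_create_decoder: decode before VCN */
};

static enum example_video_backend example_select_backend(int family_is_vcn, int is_encode,
                                                         int is_hevc)
{
   if (family_is_vcn)
      return EXAMPLE_BACKEND_VCN;
   if (is_encode)
      return is_hevc ? EXAMPLE_BACKEND_UVD_ENC : EXAMPLE_BACKEND_VCE;
   return EXAMPLE_BACKEND_UVD_DEC;
}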