diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 83cca397716..f25309736c9 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -6,6 +6,7 @@ C_SOURCES := \
 	$(GENERATED_SOURCES) \
 	cik_sdma.c \
 	driinfo_radeonsi.h \
+	gfx10_query.c \
 	gfx10_shader_ngg.c \
 	si_blit.c \
 	si_buffer.c \
diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c
new file mode 100644
index 00000000000..8584b2af505
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/gfx10_query.c
@@ -0,0 +1,521 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stddef.h>
+
+#include "si_pipe.h"
+#include "si_query.h"
+#include "util/u_memory.h"
+#include "util/u_suballoc.h"
+#include "sid.h"
+
+/**
+ * The query buffer is written to by ESGS NGG shaders with statistics about
+ * generated and (streamout-)emitted primitives.
+ *
+ * The context maintains a ring of these query buffers, and queries simply
+ * point into the ring, allowing an arbitrary number of queries to be active
+ * without additional GPU cost.
+ */
+struct gfx10_sh_query_buffer {
+	struct list_head list; /* link in si_context::shader_query_buffers, oldest first */
+	struct si_resource *buf; /* storage for an array of gfx10_sh_query_buffer_mem slots */
+	unsigned refcount; /* number of queries whose buffer range includes this one */
+
+	/* Offset into the buffer in bytes; points at the first un-emitted entry. */
+	unsigned head;
+};
+
+/* Memory layout of the query buffer. Must be kept in sync with shaders
+ * (including QBO shaders) and should be aligned to cachelines.
+ *
+ * The somewhat awkward memory layout is for compatibility with the
+ * SET_PREDICATION packet, which also means that we're setting the high bit
+ * of all those values unconditionally.
+ */
+struct gfx10_sh_query_buffer_mem {
+	struct { /* 4 x uint64_t = 32 bytes per stream */
+		uint64_t generated_primitives_start_dummy; /* presumably a stand-in for the "begin" value SET_PREDICATION expects -- confirm */
+		uint64_t emitted_primitives_start_dummy;
+		uint64_t generated_primitives;
+		uint64_t emitted_primitives;
+	} stream[4];
+	uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
+	uint32_t pad[31]; /* brings one slot to 256 bytes (128 + 4 + 124) */
+};
+
+/* Shader-based queries. */
+struct gfx10_sh_query {
+	struct si_query b; /* base object; first member, since si_query pointers are cast to this type */
+
+	struct gfx10_sh_query_buffer *first; /* buffer that was newest at begin; NULL if begin failed */
+	struct gfx10_sh_query_buffer *last; /* buffer that was newest at end */
+	unsigned first_begin; /* first->head at begin: byte offset of the first slot */
+	unsigned last_end; /* last->head at end: byte offset one past the last slot */
+
+	unsigned stream; /* the index passed to gfx10_sh_query_create */
+};
+
+static void emit_shader_query(struct si_context *sctx)
+{
+	struct list_head *ring = &sctx->shader_query_buffers;
+	/* Advance the newest buffer's head by one result slot. */
+	assert(!LIST_IS_EMPTY(ring));
+	struct gfx10_sh_query_buffer *newest = list_last_entry(ring, struct gfx10_sh_query_buffer, list);
+	newest->head += sizeof(struct gfx10_sh_query_buffer_mem);
+}
+
+static void gfx10_release_query_buffers(struct si_context *sctx,
+					struct gfx10_sh_query_buffer *first,
+					struct gfx10_sh_query_buffer *last)
+{
+	struct list_head *ring = &sctx->shader_query_buffers;
+	struct gfx10_sh_query_buffer *cur = first;
+	/* Drop one reference on every buffer in [first, last]. */
+	while (cur) {
+		struct gfx10_sh_query_buffer *victim = cur;
+		/* Step to the successor before victim can be freed. */
+		cur = victim == last ? NULL :
+			LIST_ENTRY(struct gfx10_sh_query_buffer, victim->list.next, list);
+
+		/* Keep it if it is still referenced, or if it sits at either
+		 * end of the ring: the most recent buffer may not be full yet
+		 * and the oldest buffer is kept for recycling. */
+		if (--victim->refcount ||
+		    victim->list.next == ring || victim->list.prev == ring)
+			continue;
+
+		LIST_DEL(&victim->list);
+		si_resource_reference(&victim->buf, NULL);
+		FREE(victim);
+	}
+}
+
+static bool gfx10_alloc_query_buffer(struct si_context *sctx)
+{
+	if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
+		return true; /* a slot is already bound and has not been emitted yet */
+	/* Bind the next free result slot: newest buffer, else idle oldest, else a new one. */
+	struct gfx10_sh_query_buffer *qbuf = NULL;
+
+	if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
+		qbuf = list_last_entry(&sctx->shader_query_buffers,
+				       struct gfx10_sh_query_buffer, list);
+		if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
+			goto success; /* the newest buffer still has room for one more slot */
+
+		qbuf = list_first_entry(&sctx->shader_query_buffers,
+				        struct gfx10_sh_query_buffer, list);
+		if (!qbuf->refcount &&
+		    !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
+		    sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
+			/* Can immediately re-use the oldest buffer */
+			LIST_DEL(&qbuf->list);
+		} else {
+			qbuf = NULL;
+		}
+	}
+
+	if (!qbuf) {
+		qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
+		if (unlikely(!qbuf))
+			return false;
+
+		struct si_screen *screen = sctx->screen;
+		unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
+					 screen->info.min_alloc_size);
+		qbuf->buf = si_resource(
+			pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+		if (unlikely(!qbuf->buf)) {
+			FREE(qbuf);
+			return false;
+		}
+	}
+
+	/* The buffer is currently unused by the GPU. Initialize it.
+	 *
+	 * We need to set the high bit of all the primitive counters for
+	 * compatibility with the SET_PREDICATION packet.
+	 */
+	uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
+						 PIPE_TRANSFER_WRITE |
+						 PIPE_TRANSFER_UNSYNCHRONIZED);
+	assert(results); /* NOTE(review): a failed map is only caught when asserts are enabled */
+
+	for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
+	     i < e; ++i) {
+		for (unsigned j = 0; j < 16; ++j) /* 16 qwords = the stream[4] counters of slot i */
+			results[32 * i + j] = (uint64_t)1 << 63;
+		results[32 * i + 16] = 0; /* qword 16 covers the fence (and pad[0]) */
+	}
+
+	LIST_ADDTAIL(&qbuf->list, &sctx->shader_query_buffers);
+	qbuf->head = 0;
+	qbuf->refcount = sctx->num_active_shader_queries; /* one reference per currently active query */
+
+success:;
+	struct pipe_shader_buffer sbuf;
+	sbuf.buffer = &qbuf->buf->b.b;
+	sbuf.buffer_offset = qbuf->head;
+	sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
+	si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
+
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
+	return true;
+}
+
+static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
+{
+	struct gfx10_sh_query *shq = (struct gfx10_sh_query *)rquery;
+	gfx10_release_query_buffers(sctx, shq->first, shq->last); /* unref the ring buffers first */
+	FREE(shq);
+}
+
+static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
+{
+	struct gfx10_sh_query *shq = (struct gfx10_sh_query *)rquery;
+
+	/* Restarting a query: forget whatever buffer range it covered before. */
+	gfx10_release_query_buffers(sctx, shq->first, shq->last);
+	shq->first = shq->last = NULL;
+	if (unlikely(!gfx10_alloc_query_buffer(sctx)))
+		return false;
+
+	/* The query starts at the current head of the newest ring buffer. */
+	struct gfx10_sh_query_buffer *newest = list_last_entry(
+		&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+	shq->first = newest;
+	shq->first_begin = newest->head;
+	sctx->num_active_shader_queries++;
+	newest->refcount++;
+	return true;
+}
+
+static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+	if (unlikely(!query->first))
+		return false; /* earlier out of memory error */
+	/* The query's range ends at the current head of the newest ring buffer. */
+	query->last = list_last_entry(&sctx->shader_query_buffers,
+				      struct gfx10_sh_query_buffer, list);
+	query->last_end = query->last->head;
+
+	/* Signal the fence of the previous chunk, i.e. the slot just below last_end. */
+	if (query->last_end != 0) {
+		uint64_t fence_va = query->last->buf->gpu_address;
+		fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
+		fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+		si_cp_release_mem(sctx, sctx->gfx_cs,
+				  V_028A90_BOTTOM_OF_PIPE_TS, 0,
+				  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+				  EOP_DATA_SEL_VALUE_32BIT,
+				  query->last->buf, fence_va, 0xffffffff,
+				  PIPE_QUERY_GPU_FINISHED);
+	}
+
+	sctx->num_active_shader_queries--;
+
+	if (sctx->num_active_shader_queries > 0) {
+		gfx10_alloc_query_buffer(sctx); /* NOTE(review): an allocation failure is ignored here -- confirm intended */
+	} else {
+		si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
+
+		/* If a query_begin is followed by a query_end without a draw
+		 * in-between, we need to clear the atom to ensure that the
+		 * next query_begin will re-initialize the shader buffer. */
+		si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
+	}
+
+	return true;
+}
+
+static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
+				      struct gfx10_sh_query_buffer_mem *qmem,
+				      union pipe_query_result *result)
+{
+	/* Fold one result slot into *result; counters carry a forced high bit (SET_PREDICATION). */
+	static const uint64_t mask = ((uint64_t)1 << 63) - 1;
+	switch (query->b.type) {
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+		result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
+		break;
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+		result->u64 += qmem->stream[query->stream].generated_primitives & mask;
+		break;
+	case PIPE_QUERY_SO_STATISTICS:
+		result->so_statistics.num_primitives_written +=
+			qmem->stream[query->stream].emitted_primitives & mask;
+		result->so_statistics.primitives_storage_needed +=
+			qmem->stream[query->stream].generated_primitives & mask;
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		result->b |= qmem->stream[query->stream].emitted_primitives !=
+			     qmem->stream[query->stream].generated_primitives;
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+		/* "Any" means every stream: index by the loop variable, not query->stream. */
+		for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
+			result->b |= qmem->stream[stream].emitted_primitives !=
+				     qmem->stream[stream].generated_primitives;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
+				      bool wait, union pipe_query_result *result)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+	/* Start from a cleared result; every slot in the query's range is added to it. */
+	util_query_clear_result(result, query->b.type);
+
+	if (unlikely(!query->first))
+		return false; /* earlier out of memory error */
+	assert(query->last);
+	/* Walk the ring from the query's newest buffer back to its oldest one. */
+	for (struct gfx10_sh_query_buffer *qbuf = query->last;;
+	     qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
+		unsigned usage = PIPE_TRANSFER_READ |
+				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+		uint8_t *map; /* byte pointer: arithmetic on void * is a GNU extension */
+
+		if (rquery->b.flushed)
+			map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+		else
+			map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+		if (!map)
+			return false;
+		/* Only the first and the last buffer are partially covered. */
+		unsigned results_begin = 0;
+		unsigned results_end = qbuf->head;
+		if (qbuf == query->first)
+			results_begin = query->first_begin;
+		if (qbuf == query->last)
+			results_end = query->last_end;
+
+		while (results_begin != results_end) {
+			struct gfx10_sh_query_buffer_mem *qmem =
+				(struct gfx10_sh_query_buffer_mem *)(map + results_begin);
+			results_begin += sizeof(*qmem);
+			gfx10_sh_query_add_result(query, qmem, result);
+		}
+
+		if (qbuf == query->first)
+			break;
+	}
+
+	return true;
+}
+
+static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
+					       struct si_query *rquery,
+					       bool wait,
+					       enum pipe_query_value_type result_type,
+					       int index,
+					       struct pipe_resource *resource,
+					       unsigned offset)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+	struct si_qbo_state saved_state = {};
+	struct pipe_resource *tmp_buffer = NULL;
+	unsigned tmp_buffer_offset = 0;
+	/* Write the result (index >= 0) or its availability (index < 0) to resource via compute. */
+	if (!sctx->sh_query_result_shader) {
+		sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
+		if (!sctx->sh_query_result_shader)
+			return;
+	}
+
+	if (query->first != query->last) { /* several buffers: partial sums are chained through 16 scratch bytes */
+		u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
+				     &tmp_buffer_offset, &tmp_buffer);
+		if (!tmp_buffer)
+			return;
+	}
+
+	si_save_qbo_state(sctx, &saved_state);
+
+	/* Pre-fill the constants configuring the shader behavior. */
+	struct {
+		uint32_t config;
+		uint32_t offset;
+		uint32_t chain;
+		uint32_t result_count;
+	} consts;
+	struct pipe_constant_buffer constant_buffer = {};
+
+	if (index >= 0) {
+		switch (query->b.type) { /* NOTE(review): check these byte offsets against gfx10_sh_query_buffer_mem and the result shader */
+		case PIPE_QUERY_PRIMITIVES_GENERATED:
+			consts.offset = sizeof(uint32_t) * query->stream;
+			consts.config = 0;
+			break;
+		case PIPE_QUERY_PRIMITIVES_EMITTED:
+			consts.offset = sizeof(uint32_t) * (4 + query->stream);
+			consts.config = 0;
+			break;
+		case PIPE_QUERY_SO_STATISTICS:
+			consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
+			consts.config = 0;
+			break;
+		case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+			consts.offset = sizeof(uint32_t) * query->stream;
+			consts.config = 2;
+			break;
+		case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+			consts.offset = 0;
+			consts.config = 3;
+			break;
+		default: unreachable("bad query type");
+		}
+	} else {
+		/* Check result availability. */
+		consts.offset = 0;
+		consts.config = 1;
+	}
+
+	if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
+		consts.config |= 8; /* config bit 8: write the result as a 64-bit value */
+
+	constant_buffer.buffer_size = sizeof(consts);
+	constant_buffer.user_buffer = &consts;
+
+	/* Pre-fill the SSBOs and grid. */
+	struct pipe_shader_buffer ssbo[3];
+	struct pipe_grid_info grid = {};
+
+	ssbo[1].buffer = tmp_buffer;
+	ssbo[1].buffer_offset = tmp_buffer_offset;
+	ssbo[1].buffer_size = 16;
+
+	ssbo[2] = ssbo[1];
+
+	sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+	grid.block[0] = 1;
+	grid.block[1] = 1;
+	grid.block[2] = 1;
+	grid.grid[0] = 1;
+	grid.grid[1] = 1;
+	grid.grid[2] = 1;
+
+	struct gfx10_sh_query_buffer *qbuf = query->first;
+	for (;;) {
+		unsigned begin = qbuf == query->first ? query->first_begin : 0;
+		unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
+		if (!end)
+			continue; /* NOTE(review): qbuf is not advanced before this, so it looks like it spins forever -- confirm */
+
+		ssbo[0].buffer = &qbuf->buf->b.b;
+		ssbo[0].buffer_offset = begin;
+		ssbo[0].buffer_size = end - begin;
+
+		consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
+		consts.chain = 0;
+		if (qbuf != query->first)
+			consts.chain |= 1; /* chain bit 1: have previous summary buffer */
+		if (qbuf != query->last)
+			consts.chain |= 2; /* chain bit 2: write next summary buffer */
+
+		if (qbuf == query->last) { /* the final dispatch targets the caller's buffer */
+			ssbo[2].buffer = resource;
+			ssbo[2].buffer_offset = offset;
+			ssbo[2].buffer_size = 8;
+		}
+
+		sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+		sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6); /* NOTE(review): 0x6 presumably marks slots 1 and 2 writable -- confirm */
+
+		if (wait) {
+			uint64_t va;
+
+			/* Wait for result availability. Wait only for readiness
+			 * of the last entry, since the fence writes should be
+			 * serialized in the CP.
+			 */
+			va = qbuf->buf->gpu_address;
+			va += end - sizeof(struct gfx10_sh_query_buffer_mem);
+			va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+
+			si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
+		}
+
+		sctx->b.launch_grid(&sctx->b, &grid);
+		sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+		if (qbuf == query->last)
+			break;
+		qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+	}
+
+	si_restore_qbo_state(sctx, &saved_state);
+	pipe_resource_reference(&tmp_buffer, NULL);
+}
+
+static const struct si_query_ops gfx10_sh_query_ops = { /* vtable of shader-based queries */
+	.begin = gfx10_sh_query_begin,
+	.end = gfx10_sh_query_end,
+	.get_result = gfx10_sh_query_get_result,
+	.get_result_resource = gfx10_sh_query_get_result_resource,
+	.destroy = gfx10_sh_query_destroy,
+};
+
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
+					 enum pipe_query_type query_type,
+					 unsigned index)
+{
+	/* Returns NULL on allocation failure; index becomes the query's stream. */
+	struct gfx10_sh_query *shq = CALLOC_STRUCT(gfx10_sh_query);
+
+	if (shq) {
+		shq->stream = index;
+		shq->b.type = query_type;
+		shq->b.ops = &gfx10_sh_query_ops;
+	}
+	return (struct pipe_query *)shq;
+}
+
+void gfx10_init_query(struct si_context *sctx)
+{
+	sctx->atoms.s.shader_query.emit = emit_shader_query; /* emitting advances the slot head */
+	LIST_INITHEAD(&sctx->shader_query_buffers); /* the ring starts out empty */
+}
+
+void gfx10_destroy_query(struct si_context *sctx)
+{
+	struct list_head *ring = &sctx->shader_query_buffers;
+	/* Pop buffers off the front of the ring until none are left. */
+	while (!LIST_IS_EMPTY(ring)) {
+		struct gfx10_sh_query_buffer *oldest =
+			list_first_entry(ring, struct gfx10_sh_query_buffer, list);
+		LIST_DEL(&oldest->list);
+		assert(!oldest->refcount); /* no query may still reference it */
+		si_resource_reference(&oldest->buf, NULL);
+		FREE(oldest);
+	}
+}
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 87ca56b1fdf..c97d9009164 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -64,6 +64,15 @@ static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
 			    false);
 }
 
+static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
+{
+	/* Load entry GFX10_GS_QUERY_BUF of the RW buffer list into SGPRs. */
+	LLVMValueRef rw_buffers = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
+	LLVMValueRef slot = LLVMConstInt(ctx->i32, GFX10_GS_QUERY_BUF, false);
+
+	return ac_build_load_to_sgpr(&ctx->ac, rw_buffers, slot);
+}
+
 /* Send GS Alloc Req message from the first wave of the group to SPI.
  * Message payload is:
  * - bits 0..10: vertices in group
@@ -209,6 +218,27 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
 
 	build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
 
+	/* Update query buffer */
+	tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+	ac_build_ifcc(&ctx->ac, tmp, 5030);
+	tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
+	ac_build_ifcc(&ctx->ac, tmp, 5031);
+	{
+		LLVMValueRef args[] = {
+			ngg_get_prim_cnt(ctx),
+			ngg_get_query_buf(ctx),
+			LLVMConstInt(ctx->i32, 16, false), /* offset of stream[0].generated_primitives */
+			ctx->i32_0, /* soffset */
+			ctx->i32_0, /* cachepolicy */
+		};
+
+		/* TODO: should this be 64-bit atomics? */
+		ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
+				   ctx->i32, args, 5, 0);
+	}
+	ac_build_endif(&ctx->ac, 5031);
+	ac_build_endif(&ctx->ac, 5030);
+
 	/* Export primitive data to the index buffer. Format is:
 	 *  - bits 0..8: index 0
 	 *  - bit 9: edge flag 0
@@ -431,9 +461,34 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
 	tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
 	LLVMBuildStore(builder, tmp, primflagptr);
 
+	tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+	tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
+	LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
+
 	lp_build_endif(&if_state);
 }
 
+void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
+{
+	/* Threads 0..3 of the threadgroup each clear one dword of the LDS
+	 * scratch; those dwords are used to accumulate the per-stream
+	 * generated primitive count. */
+	LLVMBuilderRef builder = ctx->ac.builder;
+	LLVMValueRef tid = get_thread_id_in_tg(ctx);
+	LLVMValueRef num_counters = LLVMConstInt(ctx->i32, 4, false);
+	LLVMValueRef clears_counter =
+		LLVMBuildICmp(builder, LLVMIntULT, tid, num_counters, "");
+
+	ac_build_ifcc(&ctx->ac, clears_counter, 5090);
+	{
+		LLVMValueRef counter = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
+		LLVMBuildStore(builder, ctx->i32_0, counter);
+	}
+	ac_build_endif(&ctx->ac, 5090);
+	/* Barrier after the clear, emitted outside the conditional. */
+	ac_build_s_barrier(&ctx->ac);
+}
+
 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
 {
 	const struct si_shader_selector *sel = ctx->shader->selector;
@@ -481,6 +536,26 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
 		ac_build_endloop(&ctx->ac, 5100);
 	}
 
+	/* Accumulate generated primitives counts across the entire threadgroup. */
+	for (unsigned stream = 0; stream < 4; ++stream) {
+		if (!info->num_stream_output_components[stream])
+			continue;
+
+		LLVMValueRef numprims =
+			LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+		numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, 64);
+
+		tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->i32_0, "");
+		ac_build_ifcc(&ctx->ac, tmp, 5105);
+		{
+			LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+					   ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
+							 LLVMConstInt(ctx->i32, stream, false)),
+					   numprims, LLVMAtomicOrderingMonotonic, false);
+		}
+		ac_build_endif(&ctx->ac, 5105);
+	}
+
 	lp_build_endif(&ctx->merged_wrap_if_state);
 
 	ac_build_s_barrier(&ctx->ac);
@@ -490,6 +565,33 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
 
 	/* TODO: streamout */
 
+	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), "");
+	ac_build_ifcc(&ctx->ac, tmp, 5110);
+	{
+		LLVMValueRef offset;
+		tmp = tid;
+		if (sel->so.num_outputs)
+			tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->i32, 3, false), "");
+		offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 32, false), "");
+		if (sel->so.num_outputs) {
+			tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->i32, 2, false), "");
+			tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 8, false), "");
+			offset = LLVMBuildAdd(builder, offset, tmp, "");
+		}
+
+		tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+		LLVMValueRef args[] = {
+			tmp,
+			ngg_get_query_buf(ctx),
+			offset,
+			LLVMConstInt(ctx->i32, 16, false), /* soffset */
+			ctx->i32_0, /* cachepolicy */
+		};
+		ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
+				   ctx->i32, args, 5, 0);
+	}
+	ac_build_endif(&ctx->ac, 5110);
+
 	/* TODO: culling */
 
 	/* Determine vertex liveness. */
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index 0ca065f34e0..a362c207776 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -21,6 +21,7 @@
 files_libradeonsi = files(
   'cik_sdma.c',
   'driinfo_radeonsi.h',
+  'gfx10_query.c',
   'gfx10_shader_ngg.c',
   'si_blit.c',
   'si_buffer.c',
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c2cee024982..91b474d4d8f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -168,6 +168,9 @@ static void si_destroy_context(struct pipe_context *context)
 
 	si_release_all_descriptors(sctx);
 
+	if (sctx->chip_class >= GFX10)
+		gfx10_destroy_query(sctx);
+
 	pipe_resource_reference(&sctx->esgs_ring, NULL);
 	pipe_resource_reference(&sctx->gsvs_ring, NULL);
 	pipe_resource_reference(&sctx->tess_rings, NULL);
@@ -239,6 +242,8 @@ static void si_destroy_context(struct pipe_context *context)
 
 	if (sctx->query_result_shader)
 		sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
+	if (sctx->sh_query_result_shader)
+		sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader);
 
 	if (sctx->gfx_cs)
 		sctx->ws->cs_destroy(sctx->gfx_cs);
@@ -516,6 +521,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	/* Initialize graphics-only context functions. */
 	if (sctx->has_graphics) {
 		si_init_context_texture_functions(sctx);
+		if (sctx->chip_class >= GFX10)
+			gfx10_init_query(sctx);
 		si_init_msaa_functions(sctx);
 		si_init_shader_functions(sctx);
 		si_init_state_functions(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a351e5004b1..874b1bf4cd0 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -868,6 +868,7 @@ struct si_context {
 	struct pipe_device_reset_callback device_reset_callback;
 	struct u_log_context		*log;
 	void				*query_result_shader;
+	void				*sh_query_result_shader;
 
 	void (*emit_cache_flush)(struct si_context *ctx);
 
@@ -1178,6 +1179,10 @@ struct si_context {
 	unsigned			num_sdma_uploads;
 	unsigned			max_sdma_uploads;
 
+	/* Shader-based queries. */
+	struct list_head		shader_query_buffers;
+	unsigned			num_active_shader_queries;
+
 	/* Statistics gathering for the DCC enablement heuristic. It can't be
 	 * in si_texture because si_texture can be shared by multiple
 	 * contexts. This is for back buffers only. We shouldn't get too many
@@ -1439,6 +1444,11 @@ void *si_clear_render_target_shader(struct pipe_context *ctx);
 void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
 void *si_create_dcc_retile_cs(struct pipe_context *ctx);
 void *si_create_query_result_cs(struct si_context *sctx);
+void *gfx10_create_sh_query_result_cs(struct si_context *sctx);
+
+/* gfx10_query.c */
+void gfx10_init_query(struct si_context *sctx);
+void gfx10_destroy_query(struct si_context *sctx);
 
 /* si_test_dma.c */
 void si_test_dma(struct si_screen *sscreen);
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index ae6498e1895..394bf7ff124 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -32,8 +32,6 @@
 #include "util/u_suballoc.h"
 #include "amd/common/sid.h"
 
-#define SI_MAX_STREAMS 4
-
 static const struct si_query_ops query_hw_ops;
 
 struct si_hw_query_params {
@@ -1015,6 +1013,12 @@ static void si_emit_query_predication(struct si_context *ctx)
 	if (!query)
 		return;
 
+	if (ctx->chip_class == GFX10 &&
+	    (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+	     query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
+		assert(!"not implemented");
+	}
+
 	invert = ctx->render_cond_invert;
 	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
 		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
@@ -1096,6 +1100,14 @@ static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned que
 	     query_type != SI_QUERY_TIME_ELAPSED_SDMA))
 		return si_query_sw_create(query_type);
 
+	if (sscreen->info.chip_class >= GFX10 &&
+	    (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
+	     query_type == PIPE_QUERY_PRIMITIVES_GENERATED ||
+	     query_type == PIPE_QUERY_SO_STATISTICS ||
+	     query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+	     query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
+		return gfx10_sh_query_create(sscreen, query_type, index);
+
 	return si_query_hw_create(sscreen, query_type, index);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index 82e5e25ed00..dc219f8551c 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -38,6 +38,8 @@ struct si_query_buffer;
 struct si_query_hw;
 struct si_resource;
 
+#define SI_MAX_STREAMS 4
+
 enum {
 	SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
 	SI_QUERY_DECOMPRESS_CALLS,
@@ -228,6 +230,12 @@ void si_query_hw_suspend(struct si_context *sctx, struct si_query *query);
 void si_query_hw_resume(struct si_context *sctx, struct si_query *query);
 
 
+/* Shader-based queries */
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
+					 enum pipe_query_type query_type,
+					 unsigned index);
+
+
 /* Performance counters */
 struct si_perfcounters {
 	unsigned num_groups;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9abecdf1003..68506b7a92c 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -6082,6 +6082,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
 			for (unsigned i = 0; i < 4; ++i) {
 				ctx->gs_curprim_verts[i] =
 					lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
+				ctx->gs_generated_prims[i] =
+					lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
 			}
 
 			LLVMTypeRef a8i32 = LLVMArrayType(ctx->i32, 8);
@@ -6135,9 +6137,15 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
 
 			if (ctx->type == PIPE_SHADER_TESS_CTRL ||
 			    ctx->type == PIPE_SHADER_GEOMETRY) {
+				if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
+					gfx10_ngg_gs_emit_prologue(ctx);
+					nested_barrier = false;
+				} else {
+					nested_barrier = true;
+				}
+
 				/* Number of patches / primitives */
 				num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
-				nested_barrier = true;
 			} else {
 				/* Number of vertices */
 				num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8);
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 09efc91b9f5..7832f75ef65 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -214,6 +214,7 @@ struct si_shader_context {
 	LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
 	LLVMValueRef gs_next_vertex[4];
 	LLVMValueRef gs_curprim_verts[4];
+	LLVMValueRef gs_generated_prims[4];
 	LLVMValueRef gs_ngg_emit;
 	LLVMValueRef gs_ngg_scratch;
 	LLVMValueRef postponed_kill;
@@ -388,6 +389,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
 void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
 			      unsigned stream,
 			      LLVMValueRef *addrs);
+void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
 void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader);
 
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index b68fd2ff236..9f2f9d30216 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -660,3 +660,228 @@ void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)
 
 	return ctx->create_compute_state(ctx, &state);
 }
+
+/* Create the compute shader that is used to collect the results of gfx10+
+ * shader queries.
+ *
+ * One compute grid with a single thread is launched for every query result
+ * buffer. The thread (optionally) reads a previous summary buffer, then
+ * accumulates data from the query result buffer, and writes the result either
+ * to a summary buffer to be consumed by the next grid invocation or to the
+ * user-supplied buffer.
+ *
+ * Data layout:
+ *
+ * BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem)
+ * BUFFER[1] = previous summary buffer
+ * BUFFER[2] = next summary buffer or user-supplied buffer
+ *
+ * CONST
+ *  0.x = config; the low 3 bits indicate the mode:
+ *          0: sum up counts
+ *          1: determine result availability and write it as a boolean
+ *          2: SO_OVERFLOW
+ *          3: SO_ANY_OVERFLOW
+ *        the remaining bits form a bitfield:
+ *          8: write result as a 64-bit value
+ *  0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
+ *  0.z = chain bit field:
+ *          1: have previous summary buffer
+ *          2: write next summary buffer
+ *  0.w = result_count
+ */
+void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
+{
+	/* TEMP[0].x = accumulated result so far
+	 * TEMP[0].y = result missing
+	 * TEMP[0].z = whether we're in overflow mode
+	 */
+	static const char text_tmpl[] =
+		"COMP\n"
+		"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+		"DCL BUFFER[0]\n"
+		"DCL BUFFER[1]\n"
+		"DCL BUFFER[2]\n"
+		"DCL CONST[0][0..0]\n"
+		"DCL TEMP[0..5]\n"
+		"IMM[0] UINT32 {0, 7, 0, 4294967295}\n"
+		"IMM[1] UINT32 {1, 2, 4, 8}\n"
+		"IMM[2] UINT32 {16, 32, 64, 128}\n"
+
+		/*
+		acc_result = 0;
+		acc_missing = 0;
+		if (chain & 1) {
+			acc_result = buffer[1][0];
+			acc_missing = buffer[1][1];
+		}
+		*/
+		"MOV TEMP[0].xy, IMM[0].xxxx\n"
+		"AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
+		"UIF TEMP[5]\n"
+			"LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
+		"ENDIF\n"
+
+		/*
+		is_overflow (TEMP[0].z) = (config & 7) >= 2;
+		result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count;
+		base_offset (TEMP[1].y) = 0;
+		for (;;) {
+			if (!result_remaining)
+				break;
+			result_remaining--;
+		*/
+		"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+		"USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
+
+		"AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
+		"UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
+		"MOV TEMP[1].y, IMM[0].xxxx\n"
+
+		"BGNLOOP\n"
+			"USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
+			"UIF TEMP[5]\n"
+				"BRK\n"
+			"ENDIF\n"
+			"UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
+
+			/*
+			fence = buffer[0]@(base_offset + 32);
+			if (!fence) {
+				acc_missing = ~0u;
+				break;
+			}
+			*/
+			"UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n"
+			"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+			"USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+			"UIF TEMP[5]\n"
+				"MOV TEMP[0].y, TEMP[5].xxxx\n"
+				"BRK\n"
+			"ENDIF\n"
+
+			/*
+			stream_offset (TEMP[2].x) = base_offset + offset;
+
+			if (!(config & 7)) {
+				acc_result += buffer[0]@stream_offset;
+			}
+			*/
+			"UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
+
+			"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+			"USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+			"UIF TEMP[5]\n"
+				"LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
+				"UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
+			"ENDIF\n"
+
+			/*
+			if ((config & 7) >= 2) {
+				count (TEMP[2].y) = (config & 1) ? 4 : 1;
+			*/
+			"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+			"USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
+			"UIF TEMP[5]\n"
+				"AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
+				"UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
+
+				/*
+				do {
+					generated = buffer[0]@stream_offset;
+					emitted = buffer[0]@(stream_offset + 16);
+					if (generated != emitted) {
+						acc_result = 1;
+						result_remaining = 0; // TEMP[1].x: makes the outer loop exit on its next check
+						break;
+					}
+
+					stream_offset += 4;
+				} while (--count);
+				*/
+				"BGNLOOP\n"
+					"UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
+					"LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n"
+					"LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n"
+					"USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
+					"UIF TEMP[5]\n"
+						"MOV TEMP[0].x, IMM[1].xxxx\n"
+						"MOV TEMP[1].x, IMM[0].xxxx\n"
+						"BRK\n"
+					"ENDIF\n"
+
+					"UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
+					"USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
+					"UIF TEMP[5]\n"
+						"BRK\n"
+					"ENDIF\n"
+					"UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n"
+				"ENDLOOP\n"
+			"ENDIF\n"
+
+		/*
+			base_offset += 64;
+		} // end outer loop
+		*/
+			"UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n"
+		"ENDLOOP\n"
+
+		/*
+		if (chain & 2) {
+			buffer[2][0] = acc_result;
+			buffer[2][1] = acc_missing;
+		} else {
+		*/
+		"AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
+		"UIF TEMP[5]\n"
+			"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
+		"ELSE\n"
+
+			/*
+			if ((config & 7) == 1) {
+				acc_result = acc_missing ? 0 : 1;
+				acc_missing = 0;
+			}
+			*/
+			"AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
+			"USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
+			"UIF TEMP[5]\n"
+				"UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
+				"MOV TEMP[0].y, IMM[0].xxxx\n"
+			"ENDIF\n"
+
+			/*
+			if (!acc_missing) {
+				buffer[2][0] = acc_result;
+				if (config & 8)
+					buffer[2][1] = 0;
+			}
+			*/
+			"USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
+			"UIF TEMP[5]\n"
+				"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+
+				"AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
+				"UIF TEMP[5]\n"
+					"STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
+				"ENDIF\n"
+			"ENDIF\n"
+		"ENDIF\n"
+
+		"END\n";
+
+	struct tgsi_token tokens[1024];
+	struct pipe_compute_state state = {};
+
+	if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
+		assert(false);
+		return NULL;
+	}
+
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = tokens;
+
+	return sctx->b.create_compute_state(&sctx->b, &state);
+}
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 678f87cd73d..757dd1bf5cd 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -228,6 +228,7 @@ union si_state_atoms {
 		struct si_atom spi_map;
 		struct si_atom scratch_state;
 		struct si_atom window_rectangles;
+		struct si_atom shader_query;
 	} s;
 	struct si_atom array[0];
 };
@@ -370,6 +371,8 @@ enum {
 	SI_PS_IMAGE_COLORBUF0_FMASK,
 	SI_PS_IMAGE_COLORBUF0_FMASK_HI,
 
+	GFX10_GS_QUERY_BUF,
+
 	SI_NUM_RW_BUFFERS,
 };